R/ClassFilter.R

#' @title
#' Impute missing intensities to zero for W4M data matrix (deprecated)
#'
#' @description
#' Substitute zero for missing or negative intensity values in W4M data matrix (synonym for w4m_filter_zero_imputation, deprecated)
#'
#' @param m  matrix: W4M data matrix potentially containing NA or negative values
#'
#' @return matrix: input data matrix with zeros substituted for negative or NA values
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://workflow4metabolomics.org/}
#'
#' @examples
#' # input contains negative and missing values
#' my_input <- matrix(c(NA,1,-1,2), ncol = 2, nrow = 2)
#'
#' # expected output converts negative and missing values to zero
#' my_expected <- matrix(c(0,1,0,2), ncol = 2, nrow = 2)
#'
#' # run the imputation method to generate actual output
#' my_output <- w4m_filter_imputation(my_input)
#'
#' # validate actual output against expected output
#' all.equal(my_output, my_expected, check.attributes = FALSE)
#'
#' @export
w4m_filter_imputation <-
  zero_imputation <-
  function(m) {
    # replace NA values with zero
    m[is.na(m)] <- 0
    # replace negative values with zero, if applicable (It should never be applicable!)
    m[m < 0] <- 0
    # return matrix as the result
    return (m)
  }

#' @title
#' Impute missing values to zero for W4M data matrix
#'
#' @description
#' Substitute zero for missing or negative intensity values in W4M data matrix
#'
#' @param m  matrix: W4M data matrix potentially containing NA or negative values
#'
#' @return matrix: input data matrix with zeros substituted for negative or NA values
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://workflow4metabolomics.org/}
#'
#' @examples
#' # input contains negative and missing values
#' my_input <- matrix(c(NA,1,-1,2), ncol = 2, nrow = 2)
#'
#' # expected output converts negative and missing values to zero
#' my_expected <- matrix(c(0,1,0,2), ncol = 2, nrow = 2)
#'
#' # run the imputation method to generate actual output
#' my_output <- w4m_filter_zero_imputation(my_input)
#'
#' # validate actual output against expected output
#' all.equal(my_output, my_expected, check.attributes = FALSE)
#'
#' @export
w4m_filter_zero_imputation <-
  zero_imputation

#' @title
#' Do not impute missing intensities to zero for W4M data matrix, but convert negative intensities to zero
#'
#' @description
#' Substitute zero for negative intensity values in W4M data matrix, but not for missing intensity values
#'
#' @param m  matrix: W4M data matrix potentially containing negative values
#'
#' @return matrix: input data matrix with zeros substituted for negative values
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://workflow4metabolomics.org/}
#'
#' @examples
#' # input contains negative and missing values
#' my_input <- matrix(c(NA,1,-1,2), ncol = 2, nrow = 2)
#'
#' # expected output converts only negative values to zero, leaving missing values unchanged
#' my_expected <- matrix(c(NA,1,0,2), ncol = 2, nrow = 2)
#'
#' # run the imputation method to generate actual output
#' my_output <- w4m_filter_no_imputation(my_input)
#'
#' # validate actual output against expected output
#' all.equal(my_output, my_expected, check.attributes = FALSE)
#'
#' @export
w4m_filter_no_imputation <-
  function(m) {
    # replace negative values with zero, if applicable
    m[m < 0] <- 0
    return (m)
  }

#' @title
#' Impute missing intensities to median for W4M data matrix
#'
#' @description
#' Substitute median feature intensity (across all samples) for missing values and zero for negative values in W4M data matrix
#'
#' @param m  matrix: W4M data matrix potentially containing NA or negative values
#'
#' @return matrix: input data matrix with zeros substituted for negative values and median substituted for missing values
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://workflow4metabolomics.org/}
#'
#' @examples
#' # input contains negative and missing values
#' my_input <- matrix(c(NA,-1,3,2), ncol = 2, nrow = 2)
#'
#' # expected output substitutes the feature median for the missing value and zero for the negative value
#' my_expected <- matrix(c(3,0,3,2), ncol = 2, nrow = 2)
#'
#' # run the imputation method to generate actual output
#' my_output <- w4m_filter_median_imputation(my_input)
#'
#' # validate actual output against expected output
#' all.equal(my_output, my_expected, check.attributes = FALSE)
#'
#' @export
w4m_filter_median_imputation <-
  function(m) {
    # Substitute NA with median for the row.
    # For W4M datamatrix:
    #   - each row has intensities for one feature
    #   - each column has intensities for one sample
    interpolate_row_median <- function(m) {
      # ref: https://stats.stackexchange.com/a/28578
      #   - Create a data.frame whose columns are features and rows are samples.
      #   - For each feature, substitute NA with the median value for the feature.
      t_result <- sapply(
          as.data.frame(t(m))
        , function(x) {
            x[is.na(x)] <- median(x, na.rm = TRUE)
            x
          }
        , simplify = TRUE
        )
      #   - Recover the rownames discarded by sapply.
      rownames(t_result) <- colnames(m)
      #   - Transform result so that rows are features and columns are samples.
      m <- t(t_result)
      # eliminate negative values
      m[m < 0] <- 0
      return (m)
    }
    return (interpolate_row_median(m))
  }
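
# Illustrative sketch (not run here): the median is computed per row (per feature),
# across samples, so different rows may receive different imputed values.
#   > m <- matrix(c(1, NA, 5,
#   +               2,  4, NA), nrow = 2, byrow = TRUE)
#   > all.equal(
#   +     w4m_filter_median_imputation(m)
#   +   , matrix(c(1, 3, 5, 2, 4, 3), nrow = 2, byrow = TRUE)
#   +   , check.attributes = FALSE
#   +   )
#   [1] TRUE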

#' @title
#' Support function to compute variances of matrix rows or columns
#'
#' @description
#' (w4mclassfilter support function) Compute variances of rows or columns of a W4M data matrix
#'
#' @param m  matrix: W4M data matrix for which variance must be computed for rows or columns
#' @param dim  integer: for variances of rows, dim == 1; for variances of columns, dim == 2
#'
#' @return vector of numeric: variances for rows or columns
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://stackoverflow.com/a/25100036}
#'
#' @examples
#'
#' m <- base::matrix(
#'   c(
#'     1, 2, 3,
#'     5, 7, 11,
#'     13, 17, 19
#'   )
#' , nrow = 3
#' , ncol = 3
#' , byrow = TRUE
#' )
#' rowvars <- w4m__var_by_rank_or_file(m = m, dim = 1)
#' expecteds <- c(stats::var(c(1,2,3)),stats::var(c(5,7,11)),stats::var(c(13,17,19)))
#' base::all.equal(rowvars, expecteds)
#' colvars <- w4m__var_by_rank_or_file(m = m, dim = 2)
#' expecteds <- c(stats::var(c(1,5,13)),stats::var(c(2,7,17)),stats::var(c(3,11,19)))
#' base::all.equal(colvars, expecteds)
#'
#' @export
w4m__var_by_rank_or_file <- function(m, dim = 1) {
  if (dim == 1) {
    dim_x_2 <- dim(m)[2]
    if ( dim_x_2 == 0 )
      stop("w4m__var_by_rank_or_file: there are zero columns")
  }
  else if (dim == 2) {
    dim_x_1 <- dim(m)[1]
    if ( dim_x_1 == 0 ) {
      stop("w4m__var_by_rank_or_file: there are zero rows")
    }
    m <- t(m)
  }
  else {
    stop("w4m__var_by_rank_or_file: dim is invalid; for rows, use dim = 1; for colums, use dim = 2")
  }
  return(rowSums( (m - rowMeans(m)) ^ 2) / (dim(m)[2] - 1) )
}
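
# Illustrative sketch (not run here): w4m__var_by_rank_or_file should agree with the
# more familiar (but slower) apply() idiom.
#   > m <- matrix(c(1, 2, 3, 5, 7, 11), nrow = 2, byrow = TRUE)
#   > all.equal(w4m__var_by_rank_or_file(m, dim = 1), apply(m, 1, stats::var))
#   [1] TRUE
#   > all.equal(w4m__var_by_rank_or_file(m, dim = 2), apply(m, 2, stats::var))
#   [1] TRUE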

# produce a matrix from matrix m where all rows and columns having zero variance have been removed
#' @title
#' Support function to eliminate rows or columns that have zero variance
#'
#' @description
#' (w4mclassfilter support function) Produce a matrix from matrix m where all rows and columns having zero variance have been removed
#'
#' @param m  matrix: W4M data matrix potentially having rows or columns having zero variance
#'
#' @return matrix: input data matrix after removal of rows or columns having zero variance
#'
#' @examples
#'
#' m <- matrix(
#'   c(
#'     1, 2, 3, 4,
#'     3, 3, 3, 4,
#'     5, 7, 11, 4,
#'     13, 17, 19, 4
#'   )
#' , nrow = 4
#' , ncol = 4
#' , byrow = TRUE
#' )
#' rownames(m) <- c("A", "B", "C", "D")
#' colnames(m) <- c("W", "X", "Y", "Z")
#' expected <- matrix(
#'   c(
#'     1, 2, 3,
#'     5, 7, 11,
#'     13, 17, 19
#'   )
#'   , nrow = 3
#'   , ncol = 3
#'   , byrow = TRUE
#' )
#' rownames(expected) <- c("A", "C", "D")
#' colnames(expected) <- c("W", "X", "Y")
#' all.equal(w4m__nonzero_var(m), expected)
#'
#' @export
w4m__nonzero_var <- function(m) {
  nonzero_var_internal <- function(x) {
    if (nrow(x) == 0) {
      utils::str(x)
      stop("matrix has no rows")
    }
    if (ncol(x) == 0) {
      utils::str(x)
      stop("matrix has no columns")
    }
    # exclude any rows or columns with zero variance
    if ( is.numeric(x) ) {
      # exclude any rows with zero variance
      if ( nrow(x) > 0) {
        row_vars <- w4m__var_by_rank_or_file(x, dim = 1)
        nonzero_row_vars <- row_vars > 0
        nonzero_rows <- row_vars[nonzero_row_vars]
        if ( length(rownames(x)) != length(rownames(nonzero_rows)) ) {
          row.names <- attr(nonzero_rows, "names")
          x <- x[ row.names, , drop = FALSE ]
        }
      }
      # exclude any columns with zero variance
      if ( ncol(x) > 0) {
        column_vars <- w4m__var_by_rank_or_file(x, dim = 2)
        nonzero_column_vars <- column_vars > 0
        nonzero_columns <- column_vars[nonzero_column_vars]
        if ( length(colnames(x)) != length(colnames(nonzero_columns)) ) {
          column.names <- attr(nonzero_columns, "names")
          x <- x[ , column.names, drop = FALSE ]
        }
      }
    }
    return (x)
  }

  # purge rows and columns that have zero variance until there are no more changes
  #   rationale: nonzero_var_internal first purges rows with zero variance, then columns with zero variance,
  #              so there exists the possibility that a row's variance might change to zero when a column is removed;
  #              therefore, iterate till there are no more changes
  if ( is.numeric(m) ) {
    my.nrow <- 0
    my.ncol <- 0
    while ( nrow(m) != my.nrow || ncol(m) != my.ncol ) {
      my.nrow <- nrow(m)
      my.ncol <- ncol(m)
      m <- nonzero_var_internal(m)
    }
  }
  return (m)
}
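
# Illustrative sketch (not run here) of why the purge must iterate: dropping the
# zero-variance column "W" leaves row "A" constant, so a second pass drops row "A".
#   > m <- matrix(c(1, 5, 5, 1, 7, 8, 1, 6, 9), nrow = 3, byrow = TRUE,
#   +             dimnames = list(c("A", "B", "C"), c("W", "X", "Y")))
#   > w4m__nonzero_var(m)
#     X Y
#   B 7 8
#   C 6 9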

#' @title
#' Filter W4M data matrix by sample-class
#'
#' @description
#' Filter a set of retention-corrected W4M files (dataMatrix, sampleMetadata, variableMetadata) by sample-class or feature-attributes
#'
#' @details
#' The W4M files dataMatrix, sampleMetadata, and variableMetadata must be a consistent set, i.e.,
#' there must be metadata in the latter two files for all (and only for) the samples and variables named in the columns and rows of dataMatrix.
#'
#' For multivariate statistics functions, samples and variables with zero variance must be eliminated, and missing values are problematic.
#'
#' Furthermore, frequently, it is desirable to analyze a subset of samples (or features) in the dataMatrix.
#'
#' First, this function produces a set of files with missing values imputed, omitting features and samples that are not consistently present within the set or that have zero variance.
#' Secondly, it provides a selection capability for samples based on whether the value in their class column matches any of the regular-expression patterns supplied in 'classes'; this capability can be used either to select the matching samples or to exclude them.
#' Thirdly, it provides a selection-capability for features based on whether their metadata lie within the ranges specified by 'variable_range_filter'.
#'
#' Finally, this function provides an advanced option to compute one of three types of centers for each treatment:
#' * "centroid" - Return only treatment-centers, computed for each treatment as the mean intensity of each feature.
#' * "median" - Return only treatment-centers, computed for each treatment as the median intensity of each feature.
#' * "medoid" - Return only treatment-centers, computed for each treatment as the sample most similar to the other samples (the medoid).
#'   * By definition, the medoid is the sample having the smallest sum of its distances from the other samples in the treatment.
#'   * Distances are computed in principal-components space.
#'     * Principal components are uncorrelated, so they are used here to minimize the distortion of computed distances by correlated features.
#' * "none" - Return all samples; do not compute centers.
#'
#' Inputs (dataMatrix_in, sampleMetadata_in, variableMetadata_in) may be:
#' * character: path to input tab-separated-values-file (TSV)
#' * data.frame
#' * matrix: allowed for dataMatrix_in only
#' * list: must have a member named "dataMatrix", "sampleMetadata", or "variableMetadata" for dataMatrix_in, sampleMetadata_in, or variableMetadata_in, respectively.
#' * environment: must have a member named "dataMatrix", "sampleMetadata", or "variableMetadata" for dataMatrix_in, sampleMetadata_in, or variableMetadata_in, respectively.
#'
#' Outputs (dataMatrix_out, sampleMetadata_out, variableMetadata_out) may be:
#' * character: path to write a tab-separated-values-file (TSV)
#' * list: will add a member named "dataMatrix", "sampleMetadata", or "variableMetadata" for dataMatrix_out, sampleMetadata_out, or variableMetadata_out, respectively.
#' * environment: will add a member named "dataMatrix", "sampleMetadata", or "variableMetadata" for dataMatrix_out, sampleMetadata_out, or variableMetadata_out, respectively.
#'
#' Please see the package vignette for further details.
#'
#' @param dataMatrix_in          input  data matrix (rows are feature names, columns are sample names)
#' @param sampleMetadata_in      input  sample metadata (rows are sample names, one column's name matches class_column)
#' @param variableMetadata_in    input  variable metadata (rows are variable names)
#' @param dataMatrix_out         output data matrix (rows are feature names, columns are sample names)
#' @param sampleMetadata_out     output sample metadata (rows are sample names, one column's name matches class_column)
#' @param variableMetadata_out   output variable metadata (rows are variable names)
#' @param classes                character vector or csv string: names of sample classes to include or exclude; default is an empty vector
#' @param include                logical: TRUE, include named sample classes; FALSE (the default), exclude named sample classes
#' @param class_column           character: name of "class" column, defaults to "class"
#' @param samplename_column      character: name of column with sample name, defaults to "sampleMetadata"
#' @param name_varmetadata_col1  logical: TRUE, name column 1 of variable metadata as "variableMetadata"; FALSE, no change; default is TRUE
#' @param name_smplmetadata_col1 logical: TRUE, name column 1 of sample metadata as "sampleMetadata"; FALSE, no change; default is TRUE
#' @param variable_range_filter  character vector or csv string: vector of filters specified as 'variableMetadataColumnName:min:max'; default is empty vector
#' @param data_imputation        function(m): default imputation method for 'intb' data, where intensities have background subtracted - impute zero for NA
#' @param order_vrbl             character vector or csv string: name(s) of column(s) of variableMetadata on which to sort, defaults to "variableMetadata" (i.e., the first column)
#' @param order_smpl             character vector or csv string: name(s) of column(s) of sampleMetadata on which to sort, defaults to "sampleMetadata" (i.e., the first column)
#' @param centering              character: center samples by class column (which names treatment).  Possible choices: "none", "centroid", "medoid", or "median"
#' @param failure_action         function(x, ...): action to take upon failure; defaults to printing the message(s) with 'cat'
#'
#' @return logical: TRUE only if filtration succeeded
#'
#' @author Art Eschenlauer, \email{esch0041@@umn.edu}
#' @concept w4m workflow4metabolomics
#' @seealso \url{https://github.com/HegemanLab/w4mclassfilter}
#' @seealso \url{http://workflow4metabolomics.org/}
#'
#' @importFrom utils read.delim write.table str
#' @importFrom stats median prcomp dist
#'
#' @examples
#' \dontrun{
#'   # set the paths to your input files
#'   dataMatrix_in <- "tests/testthat/input_dataMatrix.tsv"
#'   sampleMetadata_in <- "tests/testthat/input_sampleMetadata.tsv"
#'   variableMetadata_in <- "tests/testthat/input_variableMetadata.tsv"
#'
#'   # set the paths to your (nonexistent) output files
#'   #    in a directory that DOES need to exist
#'   dataMatrix_out <- "tests/testthat/output_dataMatrix.tsv"
#'   sampleMetadata_out <- "tests/testthat/output_sampleMetadata.tsv"
#'   variableMetadata_out <- "tests/testthat/output_variableMetadata.tsv"
#'
#'   # Example: retaining only the desired samples
#'   #   include = TRUE with classes = c("M") and class_column = "gender"
#'   #   means keep only the samples whose gender is "M"
#'   w4m_filter_by_sample_class(
#'     dataMatrix_in = dataMatrix_in
#'   , dataMatrix_out = dataMatrix_out
#'   , variableMetadata_in = variableMetadata_in
#'   , variableMetadata_out = variableMetadata_out
#'   , sampleMetadata_out = sampleMetadata_out
#'   , sampleMetadata_in = sampleMetadata_in
#'   , classes = c("M")
#'   , include = TRUE
#'   , class_column = "gender"
#'   , samplename_column = "sampleMetadata"
#'   , name_varmetadata_col1 = TRUE
#'   , name_smplmetadata_col1 = TRUE
#'   , variable_range_filter = c()
#'   , data_imputation = w4m_filter_zero_imputation
#'   , order_vrbl = "variableMetadata"
#'   , order_smpl = "sampleMetadata"
#'   , centering  = "none"
#'   , failure_action = function(...) { cat(paste(..., sep = "\n")) }
#'   )
#' }
#'
#' @export
w4m_filter_by_sample_class <- function(
  dataMatrix_in                           # character:          path to input file containing data matrix (tsv, rows are feature names, columns are sample names)
, sampleMetadata_in                       # character:          path to input file containing sample metadata (tsv, rows are sample names, one column is "class")
, variableMetadata_in                     # character:          path to input file containing variable metadata (tsv, rows are variable names)
, dataMatrix_out                          # character:          path to output file containing data matrix (tsv, rows are feature names, columns are sample names)
, sampleMetadata_out                      # character:          path to output file containing sample metadata (tsv, rows are sample names, one column is "class")
, variableMetadata_out                    # character:          path to output file containing variable metadata (tsv, rows are variable names)
, classes = c()                           # char array or csv:  names of sample classes to include or exclude (as csv string or vector of strings); default is an empty array
, include = FALSE                         # logical:            TRUE, include named sample classes; FALSE (the default), exclude named sample classes
, class_column = "class"                  # character:          name of "class" column, defaults to "class"
, samplename_column = "sampleMetadata"    # character:          name of column with sample name, defaults to "sampleMetadata"
, name_varmetadata_col1 = TRUE            # logical:            TRUE, name column 1 of variable metadata as "variableMetadata"; FALSE, no change; default is TRUE
, name_smplmetadata_col1 = TRUE           # logical:            TRUE, name column 1 of sample metadata as "sampleMetadata"; FALSE, no change; default is TRUE
, variable_range_filter = c()             # char array or csv:  array of filters specified as 'variableMetadataColumnName:min:max'; default is empty array
, data_imputation = w4m_filter_zero_imputation   # function(m): default imputation method is for 'intb' data, where intensities have background subtracted - impute zero for NA or negative
, order_vrbl = "variableMetadata"         # char array or csv:  order variables by column(s) whose name(s) is/are supplied here (as csv string or vector of strings)
, order_smpl = "sampleMetadata"           # char array or csv:  order samples by column(s) whose name(s) is/are supplied here (as csv string or vector of strings)
, centering  = c("none", "centroid", "median", "medoid")[1]   # character: center samples by class column (which names treatment)
, failure_action = function(...) { cat(paste(..., sep = "\n")) }   # function(x, ...):   action to take upon failure; defaults to printing the message(s) with 'cat'
) {

  my_failure_action <- failure_action
  # ---
  # define internal functions
  # ...

  # read_data_frame - read a w4m data frame, with error handling
  #   e.g., data_matrix_input_env <- read_data_frame(dataMatrix_in, "data matrix input")
  read_data_frame <- function(file_path, kind_string, failure_action = my_failure_action) {
    # ---
    # read in the data frame
    my.env <- new.env()
    my.env$success <- FALSE
    my.env$msg <- sprintf("no message reading %s", kind_string)
    if (!file.exists(file_path)) {
      my.env$msg <- sprintf("file '%s' not found when trying to read %s", file_path, kind_string)
      failure_action(my.env$msg)
      return ( NULL )
    }
    tryCatch(
      expr = {
        my.env$data    <- utils::read.delim( fill = FALSE, file = file_path, stringsAsFactors = FALSE )
        my.env$success <- TRUE
      }
    , error = function(e) {
       my.env$msg <- sprintf("%s read failed", kind_string)
      }
    )
    if (!my.env$success) {
      failure_action(my.env$msg)
      return ( NULL )
    }
    return (my.env)
  }

  # return FALSE if any paths are exact duplicates
  #   N.B. This does not check for different relative paths that resolve to the same file.
  my_list <- list(dataMatrix_in, dataMatrix_out, sampleMetadata_in, sampleMetadata_out, variableMetadata_in, variableMetadata_out)
  my.paths <- unlist(my_list[sapply(my_list, is.character)])
  if ( length(my.paths) != length(unique(my.paths)) ) {
    failure_action("some paths are duplicated")
    for ( my.path in my.paths ) {
      failure_action(my.path)
    }
    stop("some paths are duplicated")
    return (FALSE)
  }

  # tryCatchFunc wraps an expression that produces a value if it does not stop:
  #   tryCatchFunc produces a list
  #   On success of expr(), tryCatchFunc produces
  #     list(success TRUE, value = expr(), msg = "")
  #   On failure of expr(), tryCatchFunc produces
  #     list(success = FALSE, value = NA, msg = "the error message")
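  #   For example (illustrative):
  #     tryCatchFunc(1 + 1)         yields list(success = TRUE,  value = 2, msg = "")
  #     tryCatchFunc(stop("boom"))  yields success = FALSE, value = NA, and a msg describing the error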
  tryCatchFunc <- function(expr) {
    # format error for logging
    format_error <- function(e) {
      return(
        paste(
          c("Error { message:"
          , e$message
          , ", call:"
          , e$call
          , "}"
          )
        , collapse = " "
        )
      )
    }
    retval <- NULL
    tryCatch(
      expr = {
        retval <- ( list( success = TRUE, value = eval(expr = expr), msg = "" ) )
      }
      , error = function(e) {
        retval <<- list( success = FALSE, value = NA, msg = format_error(e) )
      }
    )
    return (retval)
  }

  # read one of three XCMS data elements: dataMatrix, sampleMetadata, variableMetadata
  # returns respectively: matrix, data.frame, data.frame, or NULL if there is a failure
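  #   For example (paths and object names here are hypothetical):
  #     read_xcms_data_element("sampleMetadata.tsv", "sampleMetadata")                  # path to a TSV file
  #     read_xcms_data_element(my_data_frame, "sampleMetadata")                         # data.frame (or matrix)
  #     read_xcms_data_element(list(sampleMetadata = my_data_frame), "sampleMetadata")  # list or environment member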
  read_xcms_data_element <- function(xcms_data_in, xcms_data_type, failure_action = stop) {
    my_failure_action <- function(...) {
      failure_action("w4mclassfilter::w4m_filter_by_sample_class::read_xcms_data_element: ", ...)
    }
    # xcms_data_type must be in c("sampleMetadata", "variableMetadata", "dataMatrix")
    if ( ! is.character(xcms_data_type) ) {
      my_failure_action(sprintf("bad parameter xcms_data_type '%s'", deparse(xcms_data_type)))
      return ( NULL )
    }
    if ( 1 != length(xcms_data_type)
         || ! ( xcms_data_type %in% c("sampleMetadata", "variableMetadata", "dataMatrix") )
    ) {
      my_failure_action( sprintf("bad parameter xcms_data_type '%s'", xcms_data_type) )
      return ( NULL )
    }
    if ( is.character(xcms_data_in) ){
      # case: xcms_data_in is a path to a file
      xcms_data_input_env <- read_data_frame(
        xcms_data_in
      , sprintf("%s input", xcms_data_type)
      )
      if (is.null(xcms_data_input_env)) {
        action_msg <- sprintf(
            "read_data_frame failed for '%s', data type '%s'"
          , toString(xcms_data_in)
          , xcms_data_type)
        my_failure_action(action_msg)
        return ( NULL )
      }
      if (!xcms_data_input_env$success) {
        my_failure_action(xcms_data_input_env$msg)
        my_failure_action("w4mclassfilter::w4m_filter_by_sample_class::read_xcms_data_element is returning NULL")
        return ( NULL )
      }
      return (xcms_data_input_env$data)
    } else if ( is.data.frame(xcms_data_in) || is.matrix(xcms_data_in) ) {
      # case: xcms_data_in is a data.frame or matrix
      return(xcms_data_in)
    } else if ( is.list(xcms_data_in) || is.environment(xcms_data_in) ) {
      # NOTE WELL: is.list succeeds for data.frame, so the is.data.frame test must appear before the is.list test
      # case: xcms_data_in is a list
      if ( ! exists(xcms_data_type, where = xcms_data_in) ) {
        my_failure_action(
          sprintf(
            "%s xcms_data_in is missing member '%s'"
          , ifelse(
              is.environment(xcms_data_in)
            , "environment"
            , "list"
            )
          , xcms_data_type
          )
        )
        return (NULL)
      }
      prospect <- getElement(name = xcms_data_type, object = xcms_data_in)
      if ( ! is.data.frame(prospect) && ! is.matrix(prospect) ) {
        utils::str("list - str(prospect)")
        utils::str(prospect)
        if ( is.list(xcms_data_in) ) {
          my_failure_action(sprintf("the first member of xcms_data_in['%s'] is neither a data.frame nor a matrix but is a %s", xcms_data_type, typeof(prospect)))
        } else {
          my_failure_action(sprintf("the first member of xcms_data_in$%s is neither a data.frame nor a matrix but is a %s", xcms_data_type, typeof(prospect)))
        }
        return (prospect)
      }
      return (prospect)
    } else {
      # case: xcms_data_in is invalid
      my_failure_action( sprintf("xcms_data_in has unexpected type %s", typeof(xcms_data_in)) )
      return (NULL)
    }
  }

  # ---
  # entrypoint
  # ...

  # ---
  # read in the sample metadata
  read_data_result <- tryCatchFunc(
    expr = {
      read_xcms_data_element(
        xcms_data_in = sampleMetadata_in
      , xcms_data_type = "sampleMetadata"
      )
    }
  )
  if (is.null(read_data_result)) {
    failure_action("read_xcms_data_element returned null; aborting w4m_filter_by_sample_class")
    return (FALSE)
  }
  if ( read_data_result$success ) {
    smpl_metadata <- read_data_result$value
  } else {
    failure_action(read_data_result$msg)
    return (FALSE)
  }

  # extract rownames
  if (name_smplmetadata_col1) {
    colnames(smpl_metadata)[1] <- "sampleMetadata"
  }
  rownames(smpl_metadata) <- make.names(smpl_metadata[ , samplename_column], unique = TRUE)

  if (length(classes) == 1) {
    classes <- unlist(strsplit( classes, "," ))
  }
  if (nchar(class_column) > 0 && length(classes) > 0) {
    # select the first column of the rows indicated by classes, include, & class_column, but don't drop dimension
    #   > Reduce(`|`,list(c(TRUE,FALSE,FALSE),c(FALSE,TRUE,FALSE),c(FALSE,FALSE,FALSE)))
    #   [1]  TRUE  TRUE FALSE
    #   > Reduce(`|`,lapply(X=c("[aC]", "[Ab]"), FUN = function(pattern) {grepl(pattern = pattern, x = c("b", "ba", "c", "ab", "bcb")) }))
    #   [1]  TRUE  TRUE FALSE  TRUE  TRUE
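    #   The xor() with !include flips the selection: with include = TRUE, rows whose class
    #   matches any pattern are kept; with include = FALSE, the matching rows are dropped.
    #   > xor(!TRUE,  c(TRUE, FALSE))   # include = TRUE  -> keep matches
    #   [1]  TRUE FALSE
    #   > xor(!FALSE, c(TRUE, FALSE))   # include = FALSE -> keep non-matches
    #   [1] FALSE  TRUE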
    selected_rows <- smpl_metadata[
      xor(
        !include
      , Reduce(
          `|`
        , lapply(X = classes, FUN = function(pattern) {
            grepl(pattern = pattern, x = smpl_metadata[ , class_column])
          })
        )
      )
    , 1
    , drop = FALSE
    ]

    # obtain the row names
    sample_names <- rownames( selected_rows )
  } else {
    sample_names <- rownames( smpl_metadata )
  }
  # ...

  # ---
  # read in the variable metadata
  read_data_result <- tryCatchFunc(
    expr = {
      read_xcms_data_element(xcms_data_in = variableMetadata_in, xcms_data_type = "variableMetadata")
    }
  )
  if (is.null(read_data_result)) {
    failure_action("read_xcms_data_element returned null; aborting w4m_filter_by_sample_class")
    return (FALSE)
  }
  if ( read_data_result$success ) {
    vrbl_metadata <- read_data_result$value
  } else {
    failure_action(read_data_result$msg)
    return (FALSE)
  }


  # extract rownames (using make.names to handle degenerate feature names)
  err.env <- new.env()
  err.env$success <- FALSE
  err.env$msg <- "no message setting vrbl_metadata rownames"
  tryCatch(
    expr = {
      rownames(vrbl_metadata) <- make.names( vrbl_metadata[ , 1 ], unique = TRUE )
      vrbl_metadata[ , 1 ] <- rownames(vrbl_metadata)
      if (name_varmetadata_col1) {
        colnames(vrbl_metadata)[1] <- "variableMetadata"
      }
      err.env$success     <- TRUE
    }
  , error = function(e) {
     err.env$msg <- sprintf("failed to set rownames for vrbl_metadata read because '%s'", e$message)
    }
  )
  if (!err.env$success) {
    failure_action(err.env$msg)
    return ( FALSE )
  }
  # ...

  # ---
  # read in the data matrix
  #   For W4M, each row has intensities for one feature and each column has intensities for one sample
  read_data_result <- tryCatchFunc(
    expr = {
      read_xcms_data_element(xcms_data_in = dataMatrix_in, xcms_data_type = "dataMatrix")
    }
  )
  if (is.null(read_data_result)) {
    failure_action("read_xcms_data_element returned null; aborting w4m_filter_by_sample_class")
    return (FALSE)
  }
  if ( read_data_result$success ) {
    data_matrix <- read_data_result$value
  } else {
    failure_action(read_data_result$msg)
    return (FALSE)
  }

  if ( ! is.matrix(data_matrix) ) {
    # extract rownames (using make.names to handle degenerate feature names)
    err.env <- new.env()
    err.env$success <- FALSE
    err.env$msg <- "no message setting data_matrix rownames"
    tryCatch(
      expr = {
        rownames(data_matrix) <- make.names( data_matrix[ , 1 ], unique = TRUE )
        colnames(data_matrix) <- make.names( colnames(data_matrix), unique = TRUE )
        err.env$success     <- TRUE
      }
    , error = function(e) {
       err.env$msg <- sprintf("failed to set rownames for data_matrix read because '%s'", e$message)
      }
    )
    if (!err.env$success) {
      failure_action(err.env$msg)
      return ( FALSE )
    }

    # remove rownames column
    data_matrix <- data_matrix[ , 2:ncol(data_matrix) ]

    # select the subset of samples indicated by classes, include, & class_column
    data_matrix <- data_matrix[ , intersect(sample_names, colnames(data_matrix)), drop = FALSE ]

    # convert data_matrix to matrix from data.frame
    data_matrix <- as.matrix(data_matrix)
  }
  # ...

  # order_metadata( metadata_df, metadata_names, order_cols )
  order_metadata <- function(metadata_df, metadata_names, order_cols) {
    if ( length(order_cols) == 1 ) {
      order_split <- unlist(strsplit( order_cols, "," ))
    } else {
      order_split <- order_cols
    }
    metadata_order <-
      Reduce(
        function(a1, a2) {
          order(
            metadata_df[metadata_names, a1],
            metadata_df[metadata_names, a2]
          )
        },
        order_split
      )
    return (metadata_order)
  }

  data_matrix_old <- data_matrix
  # Impute missing values with supplied or default method
  #   (necessary for w4m__nonzero_var)
  data_matrix <- zero_imputation(data_matrix)
  # The user-supplied data_imputation function may include transformation, so it's necessary
  #   to apply it here.  The FEATMAX filter won't work correctly without this step.
  #   (This addresses https://github.com/HegemanLab/w4mclassfilter/issues/5)
  data_matrix <- data_imputation(data_matrix)

  # ---
  # purge unwanted data from data_matrix
  some_data_were_eliminated <- TRUE
  while (some_data_were_eliminated) {
    # count rows and columns before elimination
    nrow_before <- nrow(data_matrix)
    ncol_before <- ncol(data_matrix)

    if (length(variable_range_filter) == 1) {
      variable_range_filter <- unlist(strsplit( variable_range_filter, "," ))
    }
    # run filters for variable metadata and maximum intensity for each feature
    if (length(variable_range_filter) > 0) {
      # filter variables having out-of-range metadata or intensity maximum
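      #   Each filter string has the form 'columnName:min:max' (column names below are hypothetical):
      #     "mz:200:800"    keep features whose mz lies in [200, 800]    (min <= max: inclusion range)
      #     "rt:900:300"    drop features whose rt lies in [300, 900]    (min >  max: exclusion range)
      #     "FEATMAX:1e5:"  keep features whose maximum intensity is >= 1e5
      #   An empty (or non-numeric) bound is treated as unbounded on that side.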
      for (variable_range_filter_string in variable_range_filter) {
        variable_range_filter_string <- sub(":$", ":NA",  variable_range_filter_string)
        variable_range_filter_string <- sub("::", ":NA:", variable_range_filter_string)
        split_list <- strsplit(x = variable_range_filter_string, split = ":", fixed = TRUE)
        if ( length(split_list) == 1 ) {
          split_strings <- split_list[[1]]
          if ( length(split_strings) == 3 ) {
            filter_col <- split_strings[1]
            filter_min <- tryCatch({ as.numeric(split_strings[2]) }, warning = function(w){ -Inf })
            filter_max <- tryCatch({ as.numeric(split_strings[3]) }, warning = function(w){ Inf })
            vrbl_colnames <- colnames(vrbl_metadata)
            if ( filter_col %in% vrbl_colnames ) {
              row_value <- vrbl_metadata[filter_col]
              if (filter_min <= filter_max) {
                # filter specifies an inclusion range
                keep_row <- row_value >= filter_min & row_value <= filter_max
              } else {
                # filter specifies an exclusion range
                keep_row <- row_value > filter_min | row_value < filter_max
              }
              vrbl_metadata <- vrbl_metadata[ keep_row , ]
            } else if (filter_col == "FEATMAX") {
              # apply the function 'max' to rows (1, columns would be 2) of data_matrix
              row_maxima <- apply(data_matrix, 1, max, na.rm = TRUE)
              if (filter_min <= filter_max) {
                # filter specifies an inclusion range
                keep_row <- row_maxima >= filter_min & row_maxima <= filter_max
                data_matrix <- data_matrix[ keep_row , ]
              } else {
                warning("w4m_filter_by_sample_class: FEATMAX filter specified but not applied")
              }
            }
          } else {
            warning("w4m_filter_by_sample_class: split_list is not of the expected length")
          }
        }
      }
    }

    # purge data_matrix of rows and columns that have zero variance
    data_matrix <- w4m__nonzero_var(data_matrix)

    # count rows and columns after elimination
    nrow_after <- nrow(data_matrix)
    ncol_after <- ncol(data_matrix)
    some_data_were_eliminated <- nrow_before != nrow_after || ncol_before != ncol_after
  }

  # purge smpl_metadata and vrbl_metadata of irrelevant rows
  # column names
  sample_names <- intersect(sample_names, colnames(data_matrix))
  sample_order <- if ( length(order_smpl) == 1 && ! grepl( ",", order_smpl ) ) {
    order(smpl_metadata[sample_names, order_smpl])
  } else {
    order_metadata( smpl_metadata, sample_names, order_smpl )
  }
  sample_names <- sample_names[sample_order]
  smpl_metadata <- smpl_metadata[sample_names, ]
  # row names
  variable_names <- intersect( rownames(vrbl_metadata), rownames(data_matrix) )
  variable_order <- if ( length(order_vrbl) == 1 && ! grepl( ",", order_vrbl ) ) {
    order(vrbl_metadata[variable_names, order_vrbl])
  } else {
    order_metadata( vrbl_metadata, variable_names, order_vrbl )
  }
  variable_names <- variable_names[variable_order]
  vrbl_metadata <- vrbl_metadata[variable_names, ]

  # Impute missing values with supplied or default method and the ORIGINAL dataMatrix
  #   This is to avoid biasing median-imputation toward the center of the selected features and samples.
  data_matrix <- data_imputation(data_matrix_old)


  # filter out undesired features and samples
  data_matrix <- data_matrix[variable_names, sample_names, drop = FALSE ]
  # ...

  # ---
  if (centering == "centroid") {
    treatments <- smpl_metadata[class_column][[1]]
    nrow_dm <- nrow(data_matrix)
    unitrts <- unique(treatments)
    smpl_metadata <- data.frame(
      trt = unitrts,
      n = sapply(X = unitrts, FUN = function(x) sum(x == treatments)),
      stringsAsFactors = FALSE
    )
    sample_names <- unitrts[order(unitrts)]
    # for each treatment, calculate the mean intensity for each feature
    new_df <- as.data.frame(
      sapply(
        X = unitrts,
        FUN = function(x) {
          unitrt <- x
          sapply(
            X = 1:nrow_dm,
            FUN = function(x) {
              mean(data_matrix[x, unitrt == treatments])
            }
          )
        }
      ),
      stringsAsFactors = FALSE
    )
    smpl_metadata_colnames <- colnames(smpl_metadata)
    smpl_metadata$sampleMetadata <- smpl_metadata[,"trt"]
    smpl_metadata <- smpl_metadata[c("sampleMetadata",smpl_metadata_colnames)]
    rownames(new_df) <- rownames(data_matrix)
    data_matrix <- as.matrix(new_df)
  }
  else if (centering == "median") {
    treatments <- smpl_metadata[class_column][[1]]
    nrow_dm <- nrow(data_matrix)
    unitrts <- unique(treatments)
    smpl_metadata <- data.frame(
      trt = unitrts,
      n = sapply(X = unitrts, FUN = function(x) sum(x == treatments)),
      stringsAsFactors = FALSE
    )
    sample_names <- unitrts[order(unitrts)]
    # for each treatment, calculate the median intensity for each feature
    new_df <- as.data.frame(
      sapply(
        X = unitrts,
        FUN = function(x) {
          unitrt <- x
          sapply(
            X = 1:nrow_dm,
            FUN = function(x) {
              median(data_matrix[x, unitrt == treatments])
            }
          )
        }
      ),
      stringsAsFactors = FALSE
    )
    smpl_metadata_colnames <- colnames(smpl_metadata)
    smpl_metadata$sampleMetadata <- smpl_metadata[,"trt"]
    smpl_metadata <- smpl_metadata[c("sampleMetadata",smpl_metadata_colnames)]
    rownames(new_df) <- rownames(data_matrix)
    data_matrix <- as.matrix(new_df)
  }
  else if (centering == "medoid") {
    # compute medoid (ref: https://www.biostars.org/p/11987/#11989)
    #medoid_col <- function(trt) names(which.min(rowSums(as.matrix(dist(t(trt))))))
    medoid_row  <- function(trt) names(which.min(rowSums(as.matrix(dist(  trt )))))
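    # e.g. (illustrative, hypothetical 3-sample treatment in 2-D score space):
    #   > trt <- matrix(c(0, 0, 1, 0, 10, 10), nrow = 3, byrow = TRUE,
    #   +               dimnames = list(c("s1", "s2", "s3"), NULL))
    #   > medoid_row(trt)
    #   [1] "s2"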

    treatments <- smpl_metadata[,class_column]
    unitrts <- unique(treatments)

    # When computing principal components with prcomp, set scale. to TRUE because, according to
    #   https://stat.ethz.ch/R-manual/R-devel/library/stats/html/prcomp.html,
    # "in general scaling is advisable".
    my_pca <- prcomp(t(data_matrix), scale. = TRUE, tol = sqrt(.Machine$double.eps))
    # Extract eigenvalues to determine how many are < 1
    # ref for extraction: https://stat.ethz.ch/pipermail/r-help/2005-August/076610.html
    ev <- my_pca$sdev ^ 2
    # The cut-off for the scree (the point of diminishing returns when adding additional
    # principal components) is somewhat arbitrary:
    #   https://en.wikipedia.org/wiki/Scree_plot
    # which cites
    #   Norman and Steiner, Biostatistics: The Bare Essentials, p. 201
    #   (https://books.google.com/books?id=8rkqWafdpuoC&pg=PA201)
    # To be conservative, limit the number of PCs to twice the number of eigenvalues that are greater than 1.
    # It might be better instead to keep adding components until the residual approaches some threshold.
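    #   For example (hypothetical eigenvalues): if ev = c(4.2, 1.7, 0.9, 0.4, 0.1),
    #   then sum(ev > 1) is 2, so up to min(length(ev), 2 * 2) = 4 principal components are kept.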
    my_rank <- min(length(ev),2 * sum(ev > 1))
    my_scores <- my_pca$x
    my_scores <- my_scores[,1:min(ncol(my_scores),my_rank)]

    # For each treatment, calculate the medoid, i.e.,
    #   the name of the sample having the minimum sum of distances to the other samples in the treatment;
    #   roughly, that is the sample closest to the center of the treatment in principal-components space.
    my_sapply_result <- sapply(
      X = unitrts,
      FUN = function(x) {
        unitrt <- x
        if (sum(treatments == unitrt) > 1) {
          my_trt_scores <- my_scores[treatments == unitrt, ]
          my_trt_medoid <- medoid_row(my_trt_scores)
          return (my_trt_medoid)
        } else {
          my_trt_medoid <- rownames(my_scores[treatments == unitrt, , drop=FALSE])
          return (my_trt_medoid)
        }
      }
    )

    data_matrix <- data_matrix[ , my_sapply_result, drop = FALSE ]
    # rewrite smpl_metadata:
    #   - rename column 1 as "medoid"
    #   - copy class_column into column 1 as "sampleMetadata"

    smpl_metadata <- smpl_metadata[my_sapply_result, , drop = FALSE]
    colnames(smpl_metadata)[1] <- "medoid"
    smpl_metadata_colnames <- colnames(smpl_metadata)
    smpl_metadata$sampleMetadata <- smpl_metadata[,class_column]
    smpl_metadata <- smpl_metadata[c("sampleMetadata",smpl_metadata_colnames)]
    rownames(smpl_metadata) <- smpl_metadata$sampleMetadata
    # rename data_matrix columns as class
    colnames(data_matrix) <- smpl_metadata[,class_column]
    # reset sample_names
    sample_names <- smpl_metadata$sampleMetadata
    sample_order <- if ( length(order_smpl) == 1 && ! grepl( ",", order_smpl ) ) {
      order(smpl_metadata[sample_names, order_smpl])
    } else {
      order_metadata( smpl_metadata, sample_names, order_smpl )
    }
    sample_names <- sample_names[sample_order]

    colnames(data_matrix) <- make.names(colnames(data_matrix), unique = TRUE)
    rownames(smpl_metadata) <- make.names(rownames(smpl_metadata), unique = TRUE)
    sample_names <- make.names(sample_names, unique = TRUE)
    smpl_metadata[1] <- make.names(smpl_metadata[,class_column], unique = TRUE)
  }
  # ...

  # ---
  # write out the results
  err.env <- new.env()
  err.env$success <- FALSE
  err.env$msg <- "no message writing output files"
  err.env$trace <- "trace string:"
  tryCatch(
    expr = {
      err.env$trace <- paste(err.env$trace, "A")
      # sort matrix by supplied criteria
      sub_matrix <- data_matrix[
          rownames(data_matrix) %in% variable_names # row selector
        , colnames(data_matrix) %in% sample_names   # column selector
        , drop = FALSE                              # keep two dimensions
        ]
      err.env$trace <- paste(err.env$trace, "B")
      # sort matrix to match order of variable_names and sample_names
      sorted_matrix <- sub_matrix[
          match(rownames(sub_matrix),variable_names)
        , match(colnames(sub_matrix),sample_names)
        ]
      err.env$trace <- paste(err.env$trace, "C")

      # write the data matrix
      if ( is.character(dataMatrix_out) ){

        # write the results
        utils::write.table(  x = sorted_matrix
                           , file = dataMatrix_out
                           , sep = "\t"
                           , quote = FALSE
                           , row.names = TRUE
                           , col.names = NA
                           )
      } else if ( is.environment(dataMatrix_out) || (is.list(dataMatrix_out) && ! is.matrix(dataMatrix_out)) ) {
        dataMatrix_out$dataMatrix <- sorted_matrix
      } else {
        stop(sprintf("w4m_filter_by_sample_class: dataMatrix_out has unexpected type %s"), typeof(dataMatrix_out))
        return (FALSE)
      }

      # write the sample metadata
      if ( is.character(sampleMetadata_out) ){
        utils::write.table( x = smpl_metadata
                             [ match(rownames(smpl_metadata), sample_names) # row selector
                             ,              # column selector (select all)
                             , drop = FALSE # keep two dimensions
                             ]
                           , file = sampleMetadata_out
                           , sep = "\t"
                           , quote = FALSE
                           , row.names = FALSE
                           )
      } else if ( is.environment(sampleMetadata_out) || (is.list(sampleMetadata_out) && ! is.matrix(sampleMetadata_out)) ) {
        sampleMetadata_out$sampleMetadata <-
          smpl_metadata [ match(rownames(smpl_metadata), sample_names) # row selector
                        ,              # column selector (select all)
                        , drop = FALSE # keep two dimensions
                        ]
        rownames(sampleMetadata_out$sampleMetadata) <- 1:nrow(sampleMetadata_out$sampleMetadata)
        sampleMetadata_out$sampleMetadata$sampleMetadata <- as.factor(sampleMetadata_out$sampleMetadata$sampleMetadata)
        sampleMetadata_out$sampleMetadata <- droplevels(sampleMetadata_out$sampleMetadata)
      } else {
        stop(sprintf("w4m_filter_by_sample_class: sampleMetadata_out has unexpected type %s"), typeof(sampleMetadata_out))
        return (FALSE)
      }

      # write the variable metadata
      if ( is.character(variableMetadata_out) ){
        utils::write.table( x = vrbl_metadata
                             [ variable_names # row selector
                             ,                # column selector (select all)
                             , drop = FALSE   # keep two dimensions
                             ]
                           , file = variableMetadata_out
                           , sep = "\t"
                           , quote = FALSE
                           , row.names = FALSE
                           )
      } else if ( is.environment(variableMetadata_out) || (is.list(variableMetadata_out) && ! is.matrix(variableMetadata_out)) ) {
        variableMetadata_out$variableMetadata <-
          vrbl_metadata[ variable_names # row selector
                       ,                # column selector (select all)
                       , drop = FALSE   # keep two dimensions
                       ]
        rownames(variableMetadata_out$variableMetadata) <- 1:nrow(variableMetadata_out$variableMetadata)
        variableMetadata_out$variableMetadata$variableMetadata <- as.factor(variableMetadata_out$variableMetadata$variableMetadata)
        variableMetadata_out$variableMetadata <- droplevels(variableMetadata_out$variableMetadata)
      } else {
        stop(sprintf("w4m_filter_by_sample_class: variableMetadata_out has unexpected type %s"), typeof(variableMetadata_out))
        return (FALSE)
      }

      err.env$success     <- TRUE
    }
  , error = function(e) {
     err.env$msg <- sprintf("failed to write output files because '%s'; %s", e$message, err.env$trace)
    }
  )
  # ...

  # ---
  # report results
  if (!err.env$success) {
    failure_action(err.env$msg)
    return ( FALSE )
  } else {
    return (TRUE)
  }
  # ...
}
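
# Illustrative sketch (not run here): w4m_filter_by_sample_class can also run entirely
# in memory, with environments (or lists) in place of file paths; the input objects
# named below are hypothetical.
#   > in_env <- new.env()
#   > in_env$dataMatrix       <- my_data_matrix        # matrix or data.frame of intensities
#   > in_env$sampleMetadata   <- my_sample_metadata    # data.frame with a "class" column
#   > in_env$variableMetadata <- my_variable_metadata  # data.frame of feature metadata
#   > out_env <- new.env()
#   > w4m_filter_by_sample_class(
#   +   dataMatrix_in  = in_env,  sampleMetadata_in  = in_env,  variableMetadata_in  = in_env
#   + , dataMatrix_out = out_env, sampleMetadata_out = out_env, variableMetadata_out = out_env
#   + , classes = c("blankpos"), include = FALSE
#   + )
#   > # on success (TRUE), filtered results are in out_env$dataMatrix,
#   > # out_env$sampleMetadata, and out_env$variableMetadata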

# vim: sw=2 ts=2 et ai :