R/utils.R

Defines functions filterMissingnessRate imputeMatrix impute getMissingnessRateByFeatures computeMissingRate compare.matrix valid.matrix

Documented in compare.matrix computeMissingRate filterMissingnessRate getMissingnessRateByFeatures impute imputeMatrix valid.matrix

## Utils functions

#' valid.matrix
#'
#' Check if input is a valid matrix
#'
#' @param mx A numeric matrix as input.
#'
#' @return A matrix.
#' 
valid.matrix <- function(mx) {
  if(!is.matrix(mx)) {
    stop('Expected a matrix as input...')
  } else if (any(is.na(mx))) {
    stop('Matrix with some values missing...')
  } else if (length(rownames(mx)) == 0) {
    stop('Row names are missing...')
  } else if (length(colnames(mx)) == 0) {
    stop('Column names are missing...')
  } else {
    return(mx)
  }
}



#' compare.matrix 
#'
#' @param input.matrix A numeric matrix as input.
#' @param output.matrix A numeric matrix as input.
#' @param description Short description summarizing this action.
#'
#' @return A message object.
#'
compare.matrix <- function(input.matrix, output.matrix, description = 'Percent of removed features:') {

  if(!is.matrix(input.matrix) & !is.matrix(output.matrix)) {
    stop('Expected a matrix as input...')
  }

  ncol1 <- ncol(input.matrix)
  ncol2 <- ncol(output.matrix)

  if (ncol2 < 2){
    stop('Removed all features from input matrix...')
  } else {
    pct_removed = round(((ncol1 - ncol2)/ncol1*100), 2)
    message(paste0(
      'Kept ', ncol2, ' features out of ', ncol1, '. ',
       description, ' ', pct_removed, '%'))
  }
}


#' computeMissingRate
#'
#' @param v Input vector
#'
#' @return Missing rate value
#'
computeMissingRate <- function(v){
   v <- as.numeric(v)
   mr <- sum(is.na(v)) / length(v)
   return(round(mr, 2))
}



#' getMissingnessRateByFeatures
#'
#' Given an input matrix, compute the missingness rate for every feature (expected in cols)
#'
#' @param mx A numeric matrix as input.
#'
#' @return A dataframe with features and missingness rate.
#'
getMissingnessRateByFeatures <- function(mx){
  if(length(colnames(mx)) == 0) stop('Column (feature) names are missing...')
  missingness_rate <- apply(mx, 2, computeMissingRate)
  df <- data.frame(features=names(missingness_rate),
                   missingness=missingness_rate)
  return(df)
}


#' impute
#'
#' @param v Input numeric vector
#' @param method One of 'mean' of 'median'
#'
#' @return A vector with missing values imputed.
#'
impute <- function(v, method='mean'){
   if(!method %in% c('mean', 'median')){
     stop('Imputation method must be one of mean or median...')
     }
   v <- as.numeric(v)
   if(method=='mean'){
     v[is.na(v)] <- mean(v, na.rm = T)
   } 
   if(method=='median'){
     v[is.na(v)] <- median(v, na.rm = T)
   }
   return(v)
}


#' imputeMatrix
#' 
#' Impute missing values with the mean or median across samples (rows)
#' for every feature (cols).
#'
#' @param mx  A numeric matrix as input.
#' @param method Imputation method for missing values (mean or median).
#'
#' @return Imputed matrix
#' @export
#'
imputeMatrix <- function(mx, method='mean'){
    if(!is.matrix(mx)) {
      stop('Expected a matrix as input...')
    }
    if (length(rownames(mx)) == 0) {
      stop('Row names are missing...')
    }
    if (length(colnames(mx)) == 0) {
    stop('Column names are missing...')
    }
    rnames <- rownames(mx)
    message(paste0("Imputing matrix with method: ", method))
    mx.imputed <- apply(mx, 2, impute, method = method)
    row.names(mx.imputed) <- rnames
   return(mx.imputed)
}


#' filterMissingnessRate
#'
#' @param mx A numeric matrix as input.
#' @param max_missing_rate Maximal missing rate allow for a feature (default: 0.25).
#'
#' @return Filtered matrix with features (col) passing the max missing rate threshold removed.
#' @export
#'
filterMissingnessRate <- function(mx, max_missing_rate=0.25){
  if(!is.matrix(mx)) {
    stop('Expected a matrix as input...')
  }
  if (length(rownames(mx)) == 0) {
    stop('Row names are missing...')
  } 
  if (length(colnames(mx)) == 0) {
    stop('Column names are missing...')
  } 
  
  df = getMissingnessRateByFeatures(mx)
  
  features_to_keep = subset(df, df[,'missingness'] < 
                              max_missing_rate)[,'features']
  message(
    paste0("Found ", length(features_to_keep),
           ' features with max missing rate ', max_missing_rate))
  
  # keep features passing the missing rate cutoff
  mx.filtered = mx[,colnames(mx) %in% features_to_keep]
  
  compare.matrix(input.matrix = mx,
                 output.matrix = mx.filtered)
  
  return(mx.filtered)
}
enriquea/feseR documentation built on March 30, 2021, 4:12 p.m.