R/filter_dataset.R

Defines functions filter_dataset

Documented in filter_dataset

#' Filter the features (columns) of a dataset
#'
#' Removes uninformative or redundant features in successive steps:
#' \enumerate{
#'   \item columns containing at least one \code{NA};
#'   \item constant columns (a single distinct value, all zeros, or zero
#'     standard deviation);
#'   \item columns whose most frequent value accounts for more than
#'     \code{zero_th} of the samples;
#'   \item columns flagged by \code{caret::findCorrelation} as too strongly
#'     correlated with other columns (cutoff \code{cor_th}).
#' }
#' A progress line is printed with \code{cat} before each stage.
#'
#' @import stats
#' @import caret
#' @param X is the dataset matrix with samples on rows and features on columns.
#' @param cor_th is the maximum accepted correlation between couple of features.
#'   Default value is 0.95. A value of 1 or more skips the correlation filter.
#' @param zero_th is the maximum accepted fraction of samples sharing the same
#'   value in a feature (typically zero). Default value is 0.8. A value of 1 or
#'   more skips this filter.
#' @return \code{X} restricted to the retained columns. The two-dimensional
#'   shape is preserved even when one or no column is retained.
#' @keywords filter
#' @export
#' @examples
#' X <- cbind(a = c(1, 2, 3, 4), b = c(5, 5, 5, 5), c = c(0, 0, 0, 1))
#' filter_dataset(X, cor_th = 1, zero_th = 0.7)

filter_dataset <- function(X, cor_th = .95, zero_th = .8) {
  if (length(dim(X)) != 2L) {
    stop("`X` must be a matrix or data frame with samples on rows.",
         call. = FALSE)
  }

  # Drop columns by position while always keeping the 2-d shape; without
  # `drop = FALSE` a single surviving column collapses to a plain vector and
  # every later column-wise step fails.
  drop_cols <- function(data, idx) {
    if (length(idx) > 0) {
      data <- data[, -idx, drop = FALSE]
    }
    data
  }

  cat("Removing NA\n")
  X <- drop_cols(X, which(colSums(is.na(X)) > 0))

  cat("Removing column with unique values\n")
  n_values <- apply(X, 2, function(col) length(unique(col)))
  X <- drop_cols(X, which(n_values == 1))

  # Only columns that are entirely zero are dropped here. Testing the column
  # *sum* against zero would also discard informative features whose positive
  # and negative values happen to cancel out.
  X <- drop_cols(X, which(colSums(X != 0) == 0))
  # `which()` ignores the NA produced by sd() when there is a single sample.
  X <- drop_cols(X, which(apply(X, 2, sd) == 0))

  cat("Removing column with more that zero_th% of the same value\n")
  if (zero_th < 1) {
    # Share of samples taken by the most frequent value of each column.
    top_share <- apply(X, 2, function(col) {
      counts <- table(col)
      max(counts) / sum(counts)
    })
    X <- drop_cols(X, which(top_share > zero_th))
  }

  cat("Removing strongly correlated features\n")
  # A correlation filter needs at least two features to compare.
  if (cor_th < 1 && ncol(X) > 1) {
    correlation_matrix <- cor(X)
    # Namespace-qualified call: no need to attach caret from inside a function.
    correlated <- caret::findCorrelation(correlation_matrix,
                                         cutoff = cor_th, exact = FALSE)
    X <- drop_cols(X, correlated)
  }

  X
}
angy89/hyQSAR documentation built on Sept. 24, 2019, 7:31 a.m.