#' Filter uninformative and redundant features from a dataset
#'
#' Applies a fixed sequence of column filters to \code{X} and returns the
#' columns that survive all of them. A progress line is printed with
#' \code{cat()} before each stage. The stages are, in order:
#' \enumerate{
#'   \item columns containing at least one \code{NA} are removed;
#'   \item columns holding a single distinct value are removed;
#'   \item columns whose sum is exactly zero are removed;
#'   \item columns whose standard deviation is zero are removed;
#'   \item if \code{zero_th < 1}, columns in which the most frequent value
#'     accounts for more than \code{zero_th} of the samples are removed;
#'   \item if \code{cor_th < 1}, columns flagged by
#'     \code{\link[caret]{findCorrelation}} (with \code{cutoff = cor_th} and
#'     \code{exact = FALSE}) on the correlation matrix of the remaining
#'     columns are removed.
#' }
#' @import stats
#' @import caret
#' @param X is the dataset matrix with samples on rows and features on columns.
#' @param cor_th is the cutoff passed to \code{findCorrelation} for the
#'   pairwise correlation between features. Values of 1 or more skip the
#'   correlation filter. Default value is 0.95
#' @param zero_th is the maximum accepted fraction of samples sharing the same
#'   (most frequent) value within a feature. Values of 1 or more skip this
#'   filter. Default value is 0.8
#' @return \code{X} restricted to the retained columns. Columns are always
#'   selected with \code{drop = FALSE}, so the result keeps its two dimensions
#'   even when only one (or no) feature is left.
#' @keywords filter
#' @export
#' @examples
#' X <- cbind(a = c(1, 2, 3, 4),
#'            b = c(0, 0, 0, 0),
#'            d = c(4, 1, 3, 2))
#' filter_dataset(X, cor_th = 1, zero_th = 0.8)
filter_dataset <- function(X, cor_th = .95, zero_th = .8) {
  # Remove the columns indexed by `idx`, keeping `dataset` two-dimensional.
  # Without `drop = FALSE` a single surviving column collapses to a vector and
  # every later `apply(X, 2, ...)` / `colSums(X)` call fails.
  drop_columns <- function(dataset, idx) {
    if (length(idx) > 0) {
      dataset <- dataset[, -idx, drop = FALSE]
    }
    dataset
  }

  cat("Removing NA\n")
  has_na <- apply(X, 2, function(col) any(is.na(col)))
  X <- X[, !has_na, drop = FALSE]

  cat("Removing column with unique values\n")
  n_distinct <- apply(X, 2, function(col) length(unique(col)))
  X <- drop_columns(X, which(n_distinct == 1))

  # NOTE(review): once single-valued columns are gone, a zero sum can only come
  # from values of mixed sign that cancel exactly. Kept to preserve existing
  # behaviour -- confirm this filter is intended.
  X <- drop_columns(X, which(colSums(X) == 0))

  X <- drop_columns(X, which(apply(X, 2, sd) == 0))

  cat("Removing column with more that zero_th% of the same value\n")
  if (zero_th < 1) {
    # Share of samples taken by the most frequent value of each column.
    top_share <- apply(X, 2, function(col) {
      counts <- table(col)
      max(counts) / sum(counts)
    })
    X <- drop_columns(X, which(top_share > zero_th))
  }

  cat("Removing strongly correlated features\n")
  # Fewer than two columns means there is no pair to compare, so skip the step.
  if (cor_th < 1 && ncol(X) > 1) {
    correlation_matrix <- cor(X)
    # Namespaced call instead of library(caret): no search-path side effect,
    # and caret is only touched when this filter actually runs.
    correlated <- caret::findCorrelation(
      correlation_matrix,
      cutoff = cor_th,
      exact = FALSE
    )
    X <- drop_columns(X, correlated)
  }

  X
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.