#' Removes features that have near zero variance
#'
#' This function removes features that are above a percentage threshold of features with near zero variance
#'
#' @param data_object argument is the output produced by as.ML function, which contains a single x data frame or a list of x data frames, a y data frames and attributes
#' @param threshold is the cutoff value for percentage of partitions containing features with near zero variance
#' @details
#'
#' @examples
#'dontrun{
#'}
#'
#'
#' @export
applyvarianceFilter = function(data_object, threshold = 0.1) {
# check that data_object inherits from 'as.MLinput'
# check that 'varianceFilt_info' attribute is not NULL in data_object
if (is.null(attr(data_object, "varianceFilt_info")))
stop("'varianceFilter' function has not been applied to data_object")
# first we'll extract attributes from 'data_object'
x_data = data_object$X
n_sources = attr(data_object, "n_sources")
partition_info = attr(data_object, "partition_info")
varFilt_info = attr(data_object, "varianceFilt_info")
# case where n_sources is 1
if (n_sources == 1) {
# case where dataPartitioning has not been applied
if (is.null(attr(data_object, "partition_info"))) {
# removing features with near zero variance from x_data
rm_inds = which(names(x_data) %in% varFilt_info)
if (length(rm_inds) > 0) {
new_xdata = x_data[, -rm_inds]
attr(new_xdata, "near_zero_var_removed") = varFilt_info
data_object$X = new_xdata
}
} else {
# case where dataPartitioning has been applied
# check that there are indeed features with near zero variance
if (inherits(varFilt_info, "table")) {
extra_partitions = attr(data_object, "extra_partitions")
folds = attr(data_object, "foldrep")$folds
reps = attr(data_object, "foldrep")$repeats
# apply replace_partitions
rp_result = replace_partitions(x_data = x_data, varFilt_info = varFilt_info, extra_partitions = extra_partitions,
folds = folds, reps = reps, thresh = threshold)
# update data_object
if (!is.null(rp_result$partitions_to_replace)) {
data_part_train = partition_info$train
data_part_test = partition_info$test
data_part_extra_train = extra_partitions$train
data_part_extra_test = extra_partitions$test
partitions_to_replace = rp_result$partitions_to_replace
data_part_train_new = replace(data_part_train, partitions_to_replace, data_part_extra_train[1:length(partitions_to_replace)])
data_part_test_new = replace(data_part_test, partitions_to_replace, data_part_extra_test[1:length(partitions_to_replace)])
rez = list(train = data_part_train_new, test = data_part_test_new)
attr(data_object, "partition_info") = rez
}
if (!is.null(rp_result$feat_to_remove)) {
data_object$X = x_data[, -which(names(x_data) %in% rp_result$feat_to_remove)]
}
# reset varFilt_info attr
attr(data_object, "varianceFilt_info") = NULL
# apply varianceFilter once again
data_object = varianceFilter(data_object)
varFilt_info = attr(data_object, "varianceFilt_info")
# check if there are any near zero var features and repeat process
if (inherits(varFilt_info, "table")) {
x_data = data_object$X
varFilt_info = attr(data_object, "varianceFilt_info")
partition_info = attr(data_object, "partition_info")
extra_partitions = attr(data_object, "extra_partitions")
rp_result2 = replace_partitions(x_data = x_data, varFilt_info = varFilt_info, extra_partitions = extra_partitions,
folds = folds, reps = reps, thresh = threshold)
# update data_object
if (!is.null(rp_result2$partitions_to_replace)) {
data_part_train = partition_info$train
data_part_test = partition_info$test
data_part_extra_train = extra_partitions$train
data_part_extra_test = extra_partitions$test
partitions_to_replace = rp_result2$partitions_to_replace
data_part_train_new = replace(data_part_train, partitions_to_replace, data_part_extra_train[1:length(partitions_to_replace)])
data_part_test_new = replace(data_part_test, partitions_to_replace, data_part_extra_test[1:length(partitions_to_replace)])
rez = list(train = data_part_train_new, test = data_part_test_new)
attr(data_object, "partition_info") = rez
}
if (!is.null(rp_result2$feat_to_remove)) {
data_object$X = x_data[, -which(names(x_data) %in% rp_result2$feat_to_remove)]
}
# reset varFilt_info attr
attr(data_object, "varianceFilt_info") = NULL
# apply varianceFilter once again
data_object = varianceFilter(data_object)
varFilt_info = attr(data_object, "varianceFilt_info")
if (inherits(varFilt_info, "table")) {
stop("threshold is too high, cannot generate sef of partitions where near zero variance criteria is met")
}
}
}
}
} else if (n_sources > 1) {
# case where dataPartitioning has not been applied
if (is.null(partition_info)) {
# remove near zero variance features from each data frame
new_xdata = mapply(function(x_df, vf) {
if (length(vf) > 0) {
inds_rm = which(names(x_df) %in% vf)
x_df = x_df[, -inds_rm]
} else {
x_df
}
attr(x_df, "near_zero_var_removed") = vf
return(x_df)
}, x_data, varFilt_info)
# update x_data in data_object
data_object$X = new_xdata
} else {
# case where dataPartitioning has been applied
# check that there are indeed x_data frames with near zero var features
if (any(varFilt_info != "none")) {
extra_partitions = attr(data_object, "extra_partitions")
folds = attr(data_object, "foldrep")$folds
reps = attr(data_object, "foldrep")$repeats
# apply replace_partitions
rp_result_list = mapply(replace_partitions, x_data, varFilt_info, MoreArgs = list(extra_partitions, folds, reps, threshold))
# lets update x_data with rp_result_list
new_xdata = mapply(function(x_df, rp_res) {
if (is.null(rp_res)) {
result = x_df
} else {
if (!is.null(rp_res$feat_to_remove)) {
result = x_df[, -which(names(x_df) %in% rp_res$feat_to_remove)]
attr(result, "feats_below_rm") = rp_res$feat_to_remove
} else {
result = x_df
}
}
return(result)
}, x_data, rp_result_list)
# now update x_data in data_object
data_object$X = new_xdata
# now lets gather the partitions to remove
all_partitions_to_replace = unique(unlist(lapply(rp_result_list, function(item) {
return(item$partitions_to_replace)
})))
if (!is.null(all_partitions_to_replace)) {
data_part_train = partition_info$train
data_part_test = partition_info$test
data_part_extra_train = extra_partitions$train
data_part_extra_test = extra_partitions$test
partitions_to_replace = all_partitions_to_replace
data_part_train_new = replace(data_part_train, partitions_to_replace, data_part_extra_train[1:length(partitions_to_replace)])
data_part_test_new = replace(data_part_test, partitions_to_replace, data_part_extra_test[1:length(partitions_to_replace)])
rez = list(train = data_part_train_new, test = data_part_test_new)
attr(data_object, "partition_info") = rez
}
# reset varFilt_info attr
attr(data_object, "varianceFilt_info") = NULL
# apply varianceFilter once again
data_object = varianceFilter(data_object)
varFilt_info = attr(data_object, "varianceFilt_info")
# check if there are any near zero var features and repeat process
if (any(varFilt_info != "none")) {
x_data = data_object$X
varFilt_info = attr(data_object, "varianceFilt_info")
partition_info = attr(data_object, "partition_info")
extra_partitions = attr(data_object, "extra_partitions")
# apply replace_partitions
rp_result_list2 = mapply(replace_partitions, x_data, varFilt_info, MoreArgs = list(extra_partitions, folds, reps,
threshold))
# lets update x_data with rp_result_list2
new_xdata2 = mapply(function(x_df, rp_res) {
if (is.null(rp_res)) {
result = x_df
} else {
if (!is.null(rp_res$feat_to_remove)) {
result = x_df[, -which(names(x_df) %in% rp_res$feat_to_remove)]
attr(result, "feats_below_rm") = rp_res$feat_to_remove
} else {
result = x_df
}
}
return(result)
}, x_data, rp_result_list2)
# now update x_data in data_object
data_object$X = new_xdata2
# now lets gather the partitions to remove
all_partitions_to_replace = unique(unlist(lapply(rp_result_list2, function(item) {
return(item$partitions_to_replace)
})))
if (!is.null(all_partitions_to_replace)) {
data_part_train = partition_info$train
data_part_test = partition_info$test
data_part_extra_train = extra_partitions$train
data_part_extra_test = extra_partitions$test
partitions_to_replace = all_partitions_to_replace
data_part_train_new = replace(data_part_train, partitions_to_replace, data_part_extra_train[1:length(partitions_to_replace)])
data_part_test_new = replace(data_part_test, partitions_to_replace, data_part_extra_test[1:length(partitions_to_replace)])
rez = list(train = data_part_train_new, test = data_part_test_new)
attr(data_object, "partition_info") = rez
}
# reset varFilt_info attr
attr(data_object, "varianceFilt_info") = NULL
# apply varianceFilter once again
data_object = varianceFilter(data_object)
varFilt_info = attr(data_object, "varianceFilt_info")
if (any(varFilt_info != "none")) {
stop("threshold is too high, cannot generate sef of partitions where near zero variance criteria is met")
}
}
}
}
}
# create applyvarianceFilter attr and set to TRUE
attr(data_object, "applyvarianceFilter") = TRUE
return(data_object)
}
# helper function that replaces near zero var features below 'threshold' argument
replace_partitions = function(x_data, varFilt_info, extra_partitions, folds, reps, thresh) {
if (!inherits(varFilt_info, "table")) {
final_result = NULL
} else {
col_sums = colSums(varFilt_info)
perc = sapply(col_sums, function(x) {
x/(folds * reps)
})
feat_above = names(perc)[which(perc > thresh)]
feat_below = names(perc)[which(perc < thresh)]
# remove near zero var features above threshold
if (length(feat_above) > 0) {
feat_to_remove = names(x_data)[which(names(x_data) %in% feat_above)]
} else {
feat_to_remove = NULL
}
if (length(feat_below) > 0) {
# extra partition info
data_part_extra_train = extra_partitions$train
# find all the partitions that contain the feature below the threshold
if (length(feat_below) == 1) {
partitions_to_replace = rownames(varFilt_info)[which(varFilt_info[, which(colnames(varFilt_info) %in% feat_below)] >
0)]
} else {
partitions_to_replace = rownames(varFilt_info)[which(rowSums(varFilt_info[, which(colnames(varFilt_info) %in% feat_below)]) >
0)]
}
if (length(partitions_to_replace) > length(data_part_extra_train))
stop("threshold is too high, there are more partitions to replace than replacements")
} else {
partitions_to_replace = NULL
}
final_result = list(partitions_to_replace = partitions_to_replace, feat_to_remove = feat_to_remove)
}
return(final_result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.