# install.packages('formatR')
# library(formatR)
# install.packages('gWidgetsRGtk2')
# tidy_app()
# formatR::tidy_app()
#' Title get_xy_from_DATA_C2
#'
#' @param DATA Full data matrix, includes all observations for all the variables
#' @param META_DATA Meta-data table with at least 2 columns: one with all variable names, another indicating
#' the type of each variable (CM, DX, PB)
#'
#' @return a list with two elements: x, the CM (clinical measurement) variables, and y, the DX (diagnosis) variable
#'
#' @export
#'
#' @examples
#' # x <- get_xy_from_DATA_C2(DATA, META_DATA)[[1]]
#' # y <- get_xy_from_DATA_C2(DATA, META_DATA)[[2]]
get_xy_from_DATA_C2 <- function(DATA, META_DATA) {
    # Split DATA into the CM predictors (x) and the DX response (y) using META_DATA
x <- DATA[, META_DATA$varName[META_DATA$varCategory == "CM"]]
y <- DATA[, META_DATA$varName[META_DATA$varCategory == "DX"]]
list(x = x, y = y)
}
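# A minimal sketch of the inputs this function expects (hypothetical toy data;
# kept commented so it does not run when the file is sourced):
# META_DATA <- data.frame(varName = c("cm1", "cm2", "dx", "pb1"),
#                         varCategory = c("CM", "CM", "DX", "PB"))
# DATA <- data.frame(cm1 = rnorm(10), cm2 = rnorm(10),
#                    dx = sample(1:2, 10, replace = TRUE), pb1 = rnorm(10))
# xy <- get_xy_from_DATA_C2(DATA, META_DATA)
# x <- xy$x  # CM columns
# y <- xy$y  # DX column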
####################
# Feature Selection #
#' Title Feature Selection
#'
#' @param x Data matrix
#' @param y Dependent variable
#' @param method The method to be used for the feature selection: 'RF' (random forest, default), 'AIC', 'AIC_MSFDR',
#' 'BIC', 'FDR_screening' or 'LASSO'
#' @param ... further arguments to be passed to or from other methods
#'
#' @return a list of important variables
#'
#' @export
#'
#' @examples
#' # feature_selection(x, y, method='RF')
#' # feature_selection(x[, 1:30], y, method='BIC')
#' # feature_selection(x, y, method='FDR_screening')
feature_selection <- function(x, y, method = "RF", ...) {
    if (method == "RF") {
        output <- Feature_Selection_dummy_regressions(x, y, Feature_Selection_RF,
            ...)  # ('...' : p)
    } else if (method == "AIC_MSFDR") {
        output <- Feature_Selection_dummy_regressions(x, y, Feature_Selection_AIC_MSFDR,
            ...)  # ('...' : q, print.the.steps)
    } else if (method == "BIC") {
        output <- Feature_Selection_dummy_regressions(x, y, Feature_Selection_BIC,
            ...)  # ('...' : nbest, nvmax, nmin, plot.BIC)
    } else if (method == "AIC") {
        output <- Feature_Selection_dummy_regressions(x, y, Feature_Selection_AIC)
    } else if (method == "FDR_screening") {
        output <- Feature_Selection_dummy_regressions(x, y, FDR_selection,
            ...)  # ('...' : q, eta)
    } else if (method == "LASSO") {
        output <- Feature_Selection_dummy_regressions(x, y, LASSO_selection)
    } else {
        stop("Unknown feature selection method: ", method)
    }
    return(output)
}
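# Hedged usage sketch for feature_selection() on simulated data (all object
# names here are hypothetical; kept commented so it does not run on source):
# set.seed(123)
# x_demo <- as.data.frame(matrix(rnorm(100 * 20), nrow = 100,
#                                dimnames = list(NULL, paste0("cm", 1:20))))
# y_demo <- factor(sample(1:3, 100, replace = TRUE))
# feature_selection(x_demo, y_demo, method = "RF", p = 0.2)
# feature_selection(x_demo, y_demo, method = "FDR_screening", q = 0.05, eta = 0.1)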
#' Title Feature Selection Dummy Regressions
#'
#' Finds a subset of variables based on all one-vs-rest (dummy) regressions
#'
#' @param x Data matrix
#' @param y Dependent variable
#' @param FUN Indicating which method to use for feature selection
#' @param ... further arguments to be passed to or from other methods
#'
#' @return a vector with the names of the important variables
#' @export
#'
#' @examples
#' # Feature_Selection_dummy_regressions(x, y, Feature_Selection_RF)
#'
Feature_Selection_dummy_regressions <- function(x, y, FUN, ...) {
u_y <- unique(y)
selected_variables <- list()
for (i in seq_along(u_y)) {
dummy_y <- as.numeric(y == u_y[i])
        # Run the selection function on the one-vs-rest indicator for class u_y[i]
selected_variables[[i]] <- FUN(x, dummy_y, ...)
}
# Union of all selected variables
unique(unlist(selected_variables))
}
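# The one-vs-rest recoding used above, spelled out for a single class
# (illustrative only; 'y_demo' is a hypothetical factor):
# y_demo <- factor(c("a", "b", "c", "a"))
# u_y <- unique(y_demo)
# as.numeric(y_demo == u_y[1])  # 1 for observations in class u_y[1], 0 otherwise
# The per-class selections are then combined with unique(unlist(...)),
# i.e. the union of the variables selected for each class.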
###################################
# Feature Selection - sub-functions #
### Random Forests ###
#' Title Feature Selection Using Random Forest
#'
#' @param x Data matrix
#' @param y Categorical dependent variable (factor)
#' @param p Percentage of the number of variables to be chosen from x. Default value is 0.1.
#' @return list of the top p percentage of the variables, ranked by their permutation importance (mean decrease in accuracy).
#'
#' @export
#'
#' @examples
#' # Feature_Selection_RF(x, y, p = 0.1)
#'
Feature_Selection_RF <- function(x, y, p = 0.1) {
library(randomForest)
if (!is.factor(y)) {
warning("y is not a factor - but was coerced into one.")
y <- as.factor(y)
}
rf_DX_by_CM <- randomForest(y ~ ., data = x, importance = TRUE, proximity = TRUE)
    var_import <- importance(rf_DX_by_CM)[, "MeanDecreaseAccuracy"]  # Permutation importance (mean decrease in accuracy)
    m <- round(dim(x)[2] * p)  # Keep only the top p (default 10%) of the variables
    subset_vars <- sort(var_import, decreasing = TRUE)[1:m]  # Sort the variables by their importance
important_var_RF <- names(subset_vars)
return(unlist(important_var_RF))
}
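# A small check of Feature_Selection_RF on a built-in dataset (assumes the
# randomForest package is installed; commented out so it is not run on source):
# Feature_Selection_RF(iris[, 1:4], iris$Species, p = 0.5)
# # should return the names of the 2 most important measurements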
### BIC ###
#' Title Feature Selection Using BIC
#'
#' @param x Data matrix
#' @param y numeric response vector
#' @param nbest number of subsets of each size to record
#' @param nvmax maximum size of subsets to examine
#' @param nmin minimum number of variables to be included in the suggested final model
#' @param plot.BIC if TRUE (default is FALSE) the function plots a table of models showing which variables are in each model.
#' The models are ordered by the specified model selection statistic.
#' @return
#' vector with the names of the variables of the model with minimum BIC among the models that include more than 'nmin' variables of the regsubsets object
#' @export
#'
#' @examples
#' # Feature_Selection_BIC(x[, 1:30], y, nbest=1, nvmax=5, plot.BIC=TRUE, nmin=4)
Feature_Selection_BIC <- function(x, y, nbest = 1, nvmax = 12, nmin = 4,
    plot.BIC = FALSE) {
    library(leaps)
    library(car)
    fulldata <- data.frame(x, y)  # Creating one joint data.frame of the data
    RET <- regsubsets(y ~ ., data = fulldata, nbest = nbest, nvmax = nvmax,
        really.big = TRUE)
    if (plot.BIC) {
        plot(RET, scale = "bic")
    }
    summary_RET <- summary(RET)  # Saving the summary of the regsubsets output
    help_mat <- summary_RET$which * 1  # Indicator matrix: which terms (intercept plus variables) are in each model
    num_var_each_model <- rowSums(help_mat)  # Number of terms (including the intercept) in each model
    eligible <- which(num_var_each_model >= nmin)  # Models that include at least 'nmin' terms
    ind_model_min_BIC <- eligible[which.min(summary_RET$bic[eligible])]  # Eligible model with the minimum BIC
    return(colnames(x)[which(help_mat[ind_model_min_BIC, ] == 1) - 1])  # Shift by 1 to drop the intercept column
}
### AIC with FDR ###
#' Title Forward Selection Using the AIC Criterion and the MSFDR Procedure
#'
#' @param minimal.lm lm fit of the smallest model to consider (typically intercept only)
#' @param maximal.lm lm fit of the largest model to consider (all candidate predictors)
#' @param q Significance level. Default is 0.05
#' @param print.the.steps if TRUE the Lambda, model size, and final model at each iteration will be printed;
#' Default is FALSE
#' @param print.running.time if TRUE the running time will be printed; defaults to the value of print.the.steps
#' @return
#' the final lm model chosen by the MSFDR procedure
#' @export
#'
#' @examples
#' # Feature_Selection_AIC_MSFDR(x, y, q = 0.5, print.the.steps = FALSE)
#'
MSFDR <- function(minimal.lm, maximal.lm, q, print.the.steps, print.running.time = print.the.steps) {
# computes forward model selection using the multiple stage FDR
# controlling procedure (MSFDR)
    if (!(inherits(minimal.lm, "lm") && inherits(maximal.lm, "lm"))) {
        stop("one of the models you entered is not a linear model (lm); please fit lm objects only")
    }
if (print.running.time)
time <- proc.time()
library(MASS)
algorithm.direction <- "forward" # always forward
the.scope <- list(lower = minimal.lm, upper = maximal.lm)
trace.stepAIC <- ifelse(print.the.steps, 1, 0)
iteration.number <- 1
    m <- extractAIC(maximal.lm)[1] - 1  # size of the full model excluding the intercept (TODO: check whether the full model should include the intercept)
    i <- max(extractAIC(minimal.lm)[1] - 1, 1)  # so if the minimal model is intercept-only, the size i won't be 0
Lambda <- qnorm((1 - 0.5 * q * i/(m + 1 - i * (1 - q))))^2
if (print.the.steps) {
print(paste("Starting Lambda is: ", Lambda))
}
# first step of the algorithm
new.lm <- stepAIC(minimal.lm, direction = algorithm.direction, scope = the.scope,
k = Lambda, trace = trace.stepAIC)
new.lm.model.size <- extractAIC(new.lm)[1] - 1
while (new.lm.model.size > i) {
iteration.number <- iteration.number + 1
if (print.the.steps) {
print("=========================================")
print("=========================================")
print(paste("iteration number: ", iteration.number))
print(paste("current model size is:", new.lm.model.size, ">",
i, " (which is bigger then the old model size)"))
}
i <- new.lm.model.size
Lambda <- qnorm((1 - 0.5 * q * i/(m + 1 - i * (1 - q))))^2
if (print.the.steps) {
print(paste("new Lambda is: ", Lambda))
}
new.lm <- stepAIC(new.lm, direction = algorithm.direction, scope = the.scope,
k = Lambda, trace = trace.stepAIC)
new.lm.model.size <- extractAIC(new.lm)[1] - 1
}
if (print.the.steps) {
print("=========================================")
print("=========================================")
print("=========================================")
print("The final model is: ")
print(new.lm$call)
}
if (print.running.time) {
print("")
print("Algorithm running time was:")
print(proc.time() - time)
}
return(new.lm)
}
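# The penalty used at each MSFDR iteration is the Lambda defined above,
# Lambda = qnorm(1 - 0.5 * q * i / (m + 1 - i * (1 - q)))^2.
# A quick numeric illustration (hypothetical sizes; kept commented):
# q <- 0.05; m <- 20  # 20 candidate predictors
# for (i in c(1, 5, 10)) {
#     print(qnorm(1 - 0.5 * q * i/(m + 1 - i * (1 - q)))^2)
# }
# # the penalty decreases as the current model size i grows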
# TODO: MSFDR does NOT (!!!) work with non-numeric values. Using it for
# factors will produce very wrong results. Consider extending it to also
# work with factors (e.g., through multinomial regression).
Feature_Selection_AIC_MSFDR <- function(x, y, q = 0.05, print.the.steps = FALSE) {
y <- as.numeric(y)
fulldata <- data.frame(x, y = y)
# Creating one joint data.frame of the data defining the smallest and
# largest lm we wish to progress through
smallest_linear_model <- lm(y ~ +1, data = fulldata)
largest_linear_model <- lm(y ~ ., data = fulldata)
# Implementing the MSFDR functions (with q = 0.05)
AIC_MSDFR <- MSFDR(minimal.lm = smallest_linear_model, maximal.lm = largest_linear_model,
q, print.the.steps)
    important_var_FDR <- which(!is.na(AIC_MSDFR$coeff))  # Coefficients retained in the final model
    important_var_FDR <- names(important_var_FDR)
return(unlist(important_var_FDR[2:length(important_var_FDR)]))
}
### AIC without FDR ###
#' Title Feature Selection Using AIC
#'
#' @param x data matrix
#' @param y categorical variable (factor)
#'
#' @return
#' vector with the names of the variables selected by forward stepwise AIC (the intercept is dropped)
#' @export
#'
#' @examples
#' # Feature_Selection_AIC(x, y)
Feature_Selection_AIC <- function(x, y) {
library(MASS)
y <- as.numeric(y)
fulldata <- data.frame(x, y) # Creating one joint data.frame of the data
smallest_linear_model <- lm(y ~ +1, data = fulldata)
largest_linear_model <- lm(y ~ . + 1, data = fulldata)
AIC_procedure <- stepAIC(object = smallest_linear_model, scope = list(lower = smallest_linear_model,
upper = largest_linear_model), direction = "forward", trace = FALSE)
important_var_AIC <- names(AIC_procedure$coeff)
    return(unlist(important_var_AIC[2:length(important_var_AIC)]))  # Dropping the '(Intercept)' term
}
### FDR Selection (F and Chi-sq tests)
#' Title Feature Selection Using FDR Screening
#'
#' @param x data matrix
#' @param y categorical variable (factor)
#' @param q adjusted p value threshold level. The chosen variables will have an adjusted p value smaller than q
#' @param eta eta squared threshold; the chosen variables will have an eta squared value greater than eta.
#'
#' @return
#' character vector with the names of the selected variables
#' @export
#'
#' @examples
#' # FDR_selection(x, y, q = 0.001, eta = 0.1)
FDR_selection <- function(x, y, q = 0.05, eta = 0.1) {
if (!is.factor(y)) {
warning("y is not a factor - but was coerced into one.")
y <- as.factor(y)
}
    eta_squared <- rep(NA, dim(x)[2])
    original_p_val <- rep(NA, dim(x)[2])
    for (i in 1:dim(x)[2]) {
        if (sum(floor(x[, i]) == x[, i]) == dim(x)[1]) {
            # variable is discrete: chi-squared test of association
            original_p_val[i] <- chisq.test(x = x[, i], y)$p.value
            eta_squared[i] <- summary.lm(lm(x[, i] ~ as.factor(y)))$r.squared
        } else {
            # variable is continuous: one-way ANOVA F test
            anova_model <- anova(lm(x[, i] ~ y))
            original_p_val[i] <- anova_model[["Pr(>F)"]][1]
            eta_squared[i] <- summary.lm(lm(x[, i] ~ as.factor(y)))$r.squared
        }
    }
names(original_p_val) <- colnames(x)
adjust_p_val <- p.adjust(original_p_val, method = "BH")
is_smaller <- ifelse(adjust_p_val < q & eta_squared > eta, 1, 0)
screening <- data.frame("var" = names(original_p_val), original_p_val, adjust_p_val,
eta_squared, is_smaller, row.names = c(1:length(original_p_val)))
keep_vars <- screening$var[which(is_smaller == 1)]
screening <- screening[order(original_p_val), ]
return(as.character(keep_vars))
}
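# Hedged sketch of FDR_selection on simulated data (hypothetical names;
# commented so it does not run on source):
# set.seed(1)
# y_demo <- factor(rep(1:2, each = 25))
# x_demo <- data.frame(signal = rnorm(50, mean = as.numeric(y_demo)),
#                      noise = rnorm(50))
# FDR_selection(x_demo, y_demo, q = 0.05, eta = 0.1)
# # 'signal' should usually pass both the BH-adjusted p-value and eta-squared thresholds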
### LASSO ###
#' Title Feature Selection Using LASSO
#'
#' @param x Data matrix
#' @param y Dependent variable
#'
#' @return
#' vector with the names of the variables with non-zero LASSO coefficients at lambda.1se
#'
#' @export
#'
#' @examples
#' # LASSO_selection(x, y)
LASSO_selection <- function(x, y) {
    library(glmnet)
    cvfit <- cv.glmnet(as.matrix(x), y)
    important_var_LASSO <- as.matrix(coef(cvfit, s = "lambda.1se"))
    important_var_LASSO <- important_var_LASSO[important_var_LASSO[, 1] != 0, ]
    important_var_LASSO <- important_var_LASSO[names(important_var_LASSO) != "(Intercept)"]
    # Return the names of the selected variables (consistent with the other selection helpers)
    return(names(important_var_LASSO))
}
#########################################################
# Deciding on number of clusters and clustering the data #
#' Title Deciding on Number of Clusters
#'
#' @param x Data matrix
#' @param method character string indicating how the "optimal" number of clusters is chosen: 'Euclidean' (default), 'Manhattan',
#' 'hclust_Euclidean' or 'hclust_Manhattan'
#' @param K.max the maximum number of clusters to consider, must be at least two. Default value is 10.
#' @param B integer, number of Monte Carlo ("bootstrap") samples. Default value is 100.
#' @param verbose integer or logical, determining if "progress" output should be printed. The default prints
#' one bit per bootstrap sample. Default value is FALSE.
#' @param scale if TRUE (default) the data matrix will be scaled.
#' @param diss if TRUE (default is FALSE) x will be considered as a dissimilarity matrix.
#' @param cluster.only if TRUE (default) only the clustering will be computed and returned, see details.
#' @param plot.num.clus if TRUE (default) the gap statistic plot will be printed
#'
#' @return
#' gap statistic plot and table which advise how many clusters should be used
#'
#' @export
#'
#' @examples
#' # number_of_clusters(subx, B=50, method='Euclidean')
#'
number_of_clusters <- function(x, method = "Euclidean", K.max = 10, B = 100,
verbose = FALSE, plot.num.clus = TRUE, scale = TRUE, diss = FALSE,
cluster.only = TRUE) {
# scale
if (scale) {
x <- scale(x)
}
# TODO: what we SHOULD do is pass Euclidean/Man to the functions, as
# well as hclust vs pam...
if (method == "Euclidean") {
k_clusters <- k_euclidean(x, K.max, B, verbose, plot.num.clus)
}
if (method == "Manhattan") {
k_clusters <- k_manhattan(x, K.max, diss, B, cluster.only, verbose,
plot.num.clus)
}
if (method == "hclust_Euclidean") {
k_clusters <- khclust_euc(x, K.max, B, verbose, plot.num.clus)
}
if (method == "hclust_Manhattan") {
k_clusters <- khclust_man(x, K.max, B, verbose, plot.num.clus)
}
return(list(k_clusters))
}
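# Hedged usage sketch for the gap-statistic helpers (small B keeps the run
# short; 'subx' is a hypothetical reduced data matrix; commented out):
# set.seed(2)
# subx <- rbind(matrix(rnorm(100, 0), ncol = 4), matrix(rnorm(100, 3), ncol = 4))
# number_of_clusters(subx, method = "Euclidean", K.max = 5, B = 20,
#                    plot.num.clus = TRUE)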
#' Title Gap statistic with k-medoids Euclidean
#'
#' @param x Data matrix
#' @param K.max the maximum number of clusters to consider, must be at least two. Default value is 10.
#' @param B integer, number of Monte Carlo (“bootstrap”) samples. Default value is 100.
#' @param verbose integer or logical, determining if “progress” output should be printed. The default prints
#' one bit per bootstrap sample. Default value is FALSE.
#' @param plot.num.clus if TRUE (default) the gap statistic plot will be printed
#'
#' @return the clusGap function's output
#' @export
#'
#' @examples
#' # k_euclidean(subx, K.max=8, B=50, verbose=FALSE, plot.num.clus=TRUE)
#'
k_euclidean <- function(x, K.max, B, verbose, plot.num.clus) {
library(cluster)
library(clusterCrit)
    clusGap_best <- cluster::clusGap(x, FUNcluster = pam, K.max = K.max, B = B, verbose = verbose)
if (plot.num.clus) {
plot(clusGap_best, main = "Gap Statistic for k-medoids Euclidean")
}
# # Silhouette Criteria for k-medoids sil <- c(rep(NA, 10)) sil[1] <- 0
# max_sil <- 0 clust_num_sil <- 0 for (i in 2:10) { clust <- pam(x, i,
# diss = FALSE) sil[i] <- intCriteria(x, clust$cluster, 'Silhouette')
# if (as.numeric(sil[i]) > max_sil) { max_sil_means <- sil[i]
# clust_num_sil <- i } } if (plot.num.clus) { plot(as.numeric(sil),
# type = 'l', main = 'Silhouette criteria k-medoids Euclidean') }
# return(list(clusGap_best, clust))
return(list(clusGap_best))
}
#' Title Gap statistic with k-medoids Manhattan
#'
#' @param x data matrix
#' @param K.max the maximum number of clusters to consider, must be at least two.
#' Default value is 10.
#' @param diss if TRUE (default is FALSE) x will be considered as a dissimilarity matrix
#' @param B integer, number of Monte Carlo ("bootstrap") samples. Default value is 100.
#' @param cluster.only if TRUE (default) only the clustering will be computed and returned, see details.
#' @param verbose integer or logical, determining if "progress" output should be printed. The default prints
#' one bit per bootstrap sample. Default is FALSE.
#' @param plot.num.clus if TRUE (default) the gap statistic plot will be printed
#' @param ... further arguments passed to the pam function
#'
#' @return the clusGap function's output
#' @export
#'
#' @examples
#' # k_manhattan (subx, K.max = 8, diss=FALSE, B = 50, cluster.only = TRUE, verbose = FALSE)
#'
k_manhattan <- function(x, K.max, diss, B, cluster.only, verbose, plot.num.clus) {
library(cluster)
library(clusterCrit)
library(magrittr)
library(fpc)
pam_1 <- function(x, k, ...) {
clusters <- x %>% pam(k = k, diss = diss, metric = "manhattan",
cluster.only = cluster.only)
list(clusters = clusters)
}
set.seed(40)
clusGap_best <- clusGap(x, FUN = pam_1, K.max = K.max, B = B, verbose = verbose)
if (plot.num.clus) {
plot(clusGap_best, main = "Gap Statistic for k-medoids Manhattan")
}
# #Silhouette criteria with k-medoids manhattan
# sil_med_m<-c(rep(NA,10)) sil_med_m[1]<-0 max_sil_med_m<-0
# clust_num_sil_med_m<-0 for (i in 2:10) {
# clust_med_m<-pam(Scaled_Reduced_CM_trans,i,diss=FALSE,metric='manhattan')
# sil_med_m[i]<-intCriteria(Scaled_Reduced_CM_trans,clust_med_m$cluster,'Silhouette')
# if (as.numeric(sil_med_m[i]) > max_sil_med_m) {
# max_sil_med_m<-sil_med_m[i] clust_num_sil_med_m<-i } }
# plot(as.numeric(sil_med_m),type='l',main='Silhouette criteria,
# k-medoids manhattan')
return(list(clusGap_best))
}
#' Title Gap statistics for hclust Euclidean
#'
#' @param x data matrix
#' @param K.max the maximum number of clusters to consider, must be at least two
#' @param B integer, number of Monte Carlo ("bootstrap") samples
#' @param verbose integer or logical, determining if “progress” output should be printed. The default prints
#' one bit per bootstrap sample
#' @param plot.num.clus if TRUE (default) the gap statistic plot will be printed
#'
#' @return the clusGap function output
#' @export
#'
#' @examples
#' # khclust_euc(subx,K.max=10, B=60, verbose = FALSE, plot.num.clus=TRUE )
#'
khclust_euc <- function(x, K.max, B, verbose, plot.num.clus) {
    library(cluster)
    library(magrittr)
    hclust_k_euc <- function(x, k, ...) {
        clusters <- x %>% dist %>% hclust %>% cutree(k = k)
        list(clusters = clusters)
    }
clusGap_best <- clusGap(x, FUN = hclust_k_euc, K.max = K.max, B = B,
verbose = verbose)
if (plot.num.clus) {
plot(clusGap_best, main = "Gap statistic, hclust Euclidean")
}
return(clusGap_best)
}
#' Title Gap statistics for hclust Manhattan
#'
#' @param x data matrix
#' @param K.max the maximum number of clusters to consider, must be at least two.
#' Default value is 10
#' @param B integer, number of Monte Carlo (“bootstrap”) samples. Default value is 100.
#' @param verbose integer or logical, determining if “progress” output should be printed. The default prints
#' one bit per bootstrap sample. Default value is FALSE.
#' @param plot.num.clus if TRUE (default) the gap statistic plot will be printed
#'
#' @return the clusGap function output
#' @export
#'
#' @examples
#' # khclust_man(subx, K.max=8, B=60, verbose=FALSE, plot.num.clus=TRUE)
#'
khclust_man <- function(x, K.max, B, verbose, plot.num.clus) {
    library(cluster)
    library(magrittr)
    hclust_k_man <- function(x, k, ...) {
        clusters <- x %>% dist(method = "manhattan") %>% hclust %>% cutree(k = k)
        list(clusters = clusters)
    }
clusGap_best <- clusGap(x, FUN = hclust_k_man, K.max = K.max, B = B,
verbose = verbose)
if (plot.num.clus) {
plot(clusGap_best, main = "Gap statistic, hclust Manhattan")
}
return(list(clusGap_best))
}
#####################
# Clustering the data #
#' Title Clustering
#'
#' @param x data matrix
#' @param k.gap positive integer specifying the number of clusters, less than the number of observations. Default value is 2.
#' @param method Indicating which method to use for clustering: 'Euclidean' (default), 'Manhattan', 'Heuclidean' or 'Hmanhattan'.
#' @param plot.clustering if TRUE (default is FALSE) a 2-dimensional "clusplot" plot will be printed
#'
#' @return vector with the new assigned clusters
#' @export
#'
#' @examples
#' # clustering(subx, k.gap = 5, method='Euclidean', plot.clustering=TRUE)
#'
clustering <- function(x, k.gap = 2, method = "Euclidean", plot.clustering = FALSE) {
if (method == "Euclidean") {
clusters <- cluster_euclidean(x, k.gap, plot.clustering)
}
if (method == "Manhattan") {
clusters <- cluster_manhattan(x, k.gap, plot.clustering)
}
if (method == "Heuclidean") {
clusters <- cluster_euclidean(x, k.gap, plot.clustering)
}
if (method == "Hmanhattan") {
clusters <- cluster_manhattan(x, k.gap, plot.clustering)
}
return(clusters)
}
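# Hedged sketch tying the two steps together (hypothetical 'subx' as in the
# number_of_clusters example above; commented out so it does not run on source):
# gap <- number_of_clusters(subx, method = "Manhattan", K.max = 5, B = 20)
# clusters <- clustering(subx, k.gap = 2, method = "Manhattan", plot.clustering = TRUE)
# table(clusters)  # size of each assigned cluster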
### Euclidean ###
#' Title Clustering Using Euclidean distances
#'
#' @param x data matrix
#' @param k.gap positive integer specifying the number of clusters, less than the number of observations
#' @param plot.clustering if TRUE a 2-dimensional "clusplot" plot will be printed
#'
#' @return
#' vector with the new assigned clusters
#'
#' @export
#'
#' @examples
#' # cluster_euclidean(subx, k.gap = 5, plot.clustering = TRUE)
#'
cluster_euclidean <- function(x, k.gap, plot.clustering) {
library(cluster)
pam_4 <- pam(x, k.gap, diss = FALSE)
    if (plot.clustering) {
        clusplot(x, pam_4$cluster, color = TRUE, main = paste("k-medoids,",
            k.gap, "clusters"))
    }
clusters <- pam_4$cluster
return(unlist(clusters))
}
### Manhattan ###
#' Title Clustering Using Manhattan Distances
#'
#' @param x data matrix
#' @param k.gap positive integer specifying the number of clusters, less than the number of observations
#' @param plot.clustering if TRUE a 2-dimensional "clusplot" plot will be printed
#'
#' @return
#' vector with the new assigned clusters
#' @export
#'
#' @examples
#' # cluster_manhattan(subx, k.gap=4, plot.clustering=TRUE)
#'
cluster_manhattan <- function(x, k.gap, plot.clustering) {
    library(cluster)
    pam_3_man <- pam(x, k.gap, diss = FALSE, metric = "manhattan")
    if (plot.clustering) {
        clusplot(x, pam_3_man$cluster, color = TRUE, main = paste("k-medoids, manhattan,",
            k.gap, "clusters"))
    }
clusters <- pam_3_man$cluster
return(unlist(clusters))
}
### Hierarchical clustering euclidean ###
#' Title Clustering Using Hierarchical Clustering with Euclidean Distances
#'
#' @param x data matrix
#' @param k.gap positive integer specifying the number of clusters, less than the number of observations
#' @param plot.clustering if TRUE a dendrogram of the clustering will be plotted
#'
#' @return
#' vector with the new assigned clusters
#' @export
#'
#' @examples
#' # hclust_euc(subx, k.gap = 5, plot.clustering=TRUE)
#'
hclust_euc <- function(x, k.gap, plot.clustering) {
d <- dist(x, method = "euclidean")
fit_best <- hclust(d, method = "ward.D")
    if (plot.clustering) {
        plot(fit_best, main = paste("hclust, euclidean,", k.gap, "clusters"))
        rect.hclust(fit_best, k = k.gap, border = "blue")  # Highlight the k.gap clusters on the dendrogram
    }
    groups_best_4 <- cutree(fit_best, k = k.gap)
    clusters <- groups_best_4
return(unlist(clusters))
}
### Hierarchical clustering manhattan ###
#' Title Clustering Using Hierarchical Clustering with Manhattan Distances
#'
#' @param x data matrix
#' @param k.gap positive integer specifying the number of clusters, less than the number of observations
#' @param plot.clustering if TRUE a dendrogram of the clustering will be plotted
#'
#' @return
#' vector with the new assigned clusters
#' @export
#'
#' @examples
#' # hclust_man(subx, k.gap = 5, plot.clustering=TRUE)
#'
hclust_man <- function(x, k.gap, plot.clustering) {
d_man <- dist(x, method = "manhattan")
fit_best_man <- hclust(d_man, method = "ward.D")
    if (plot.clustering) {
        plot(fit_best_man, main = paste("hclust, manhattan,", k.gap, "clusters"))
        rect.hclust(fit_best_man, k = k.gap, border = "red")  # Highlight the k.gap clusters on the dendrogram
    }
    groups_best_4_man <- cutree(fit_best_man, k = k.gap)
    clusters <- groups_best_4_man
return(unlist(clusters))
}
###############
# 3 C functions #
#' Title C2
#'
#' @param x data matrix
#' @param y Dependent variable
#' @param feature_selection_method method for the feature selection of the clinical measurements stage (e.g. 'RF')
#' @param num_clusters_method method for choosing the number of clusters using the clinical measurements (e.g. 'Euclidean')
#' @param k number of clusters to use. If missing or NULL, the number of clusters is chosen interactively after the detection step. Default is NULL
#' @param clustering_method method for clustering using the reduced clinical measures (e.g. 'Hmanhattan')
#' @param ... further arguments to be passed to or from other methods
#'
#' @return a list of three variables:
#' 1) vector with the names of the important variables chosen
#' 2) number of clusters that will be used for clustering
#' 3) vector of the new assigned clusters
#'
#' @export
#'
#' @examples
#' # resultC2 <- C2(x, y, feature_selection_method='RF', num_clusters_method='Manhattan', clustering_method='Manhattan', plot.num.clus=TRUE, plot.clustering=TRUE)
#' # C2(x, y, feature_selection_method='BIC', num_clusters_method='Manhattan', clustering_method='Hmanhattan', plot.num.clus=TRUE, plot.clustering=FALSE, nbest=1, nvmax=8, B=50)
C2 <- function(x, y, feature_selection_method, num_clusters_method, k = NULL,
clustering_method, ...) {
# Feature selection
    imp_var <- feature_selection(x, y, method = feature_selection_method)
    # print(imp_var)
    # CM_final_vars <- imp_var[[1]][2]  # Extracting a list of important CM variables
    subx <- x[, unlist(imp_var)]  # Keep only the selected CM variables
# Deciding on number of clusters
    if (missing(k) || is.null(k)) {
num_clust <- number_of_clusters(x = subx, method = num_clusters_method)
print(num_clust)
# library(car)
        user_choice <- function() {
            k <- readline(prompt = paste("Enter the chosen number of clusters",
                ":\n"))
            k <- as.numeric(k)
            return(k)
        }
        num_clust <- user_choice()
} else {
num_clust <- k
}
# Final clustering
    final_cluster <- clustering(subx, k.gap = num_clust, method = clustering_method)
# print(final_cluster)
return(list(imp_var, num_clust, final_cluster))
}
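# Hedged end-to-end sketch of the C2 stage (hypothetical DATA/META_DATA as in
# the get_xy_from_DATA_C2 example above; commented out because supplying k
# avoids the interactive prompt, while omitting it would require user input):
# xy <- get_xy_from_DATA_C2(DATA, META_DATA)
# resultC2 <- C2(xy$x, xy$y, feature_selection_method = "RF",
#                num_clusters_method = "Euclidean", k = 3,
#                clustering_method = "Manhattan")
# newy <- resultC2[[3]]  # new cluster assignments, used as the response in C3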
#' Title get_PBx_from_DATA_C3
#'
#' @param DATA Full data matrix, includes all observations for all the variables
#' @param META_DATA Need to have at least 2 columns, one with all variables name, another one which indicate
#' the type of each variable (CM, DX, PB)
#'
#' @return data frame containing only the PB (potential biomarker) variables
#'
#' @export
#'
#' @examples
#' # PBx <- get_PBx_from_DATA_C3(DATA, META_DATA)
#'
get_PBx_from_DATA_C3 <- function(DATA, META_DATA) {
x <- DATA[, META_DATA$varName[META_DATA$varCategory == "PB"]]
    return(x)
}
#' Title C3
#'
#' @param PBx data matrix
#' @param newy new assigned clusters, results from C2.
#' @param feature_selection_method method for the feature selection of the Potential Bio-Markers
#' @param classification_method method for classification using the potential bio-markers
#'
#' @return a list of two variables:
#' 1) vector with the names of important variables chosen
#' 2) classification result for each observation
#' @export
#'
#' @examples
#' C3(PBx, newy, feature_selection_method='RF', classification_method='RF')
#'
C3 <- function(PBx, newy, feature_selection_method, classification_method) {
    # Feature selection
    # if (!is.factor(newy)) { newy <- as.factor(newy) }
imp_var <- feature_selection(PBx, newy, method = feature_selection_method)
sub_PBx <- PBx[, imp_var]
    # Classification on the reduced set of selected potential biomarkers
    classification <- classification_fun(sub_PBx, newy, method = classification_method)
return(list(imp_var, unname(classification)))
}
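# Hedged sketch of the C3 stage, reusing the clusters from C2 (hypothetical
# objects; commented out so it does not run on source):
# PBx <- get_PBx_from_DATA_C3(DATA, META_DATA)
# resultC3 <- C3(PBx, newy, feature_selection_method = "RF",
#                classification_method = "RF")
# resultC3[[1]]  # selected potential biomarkers
# resultC3[[2]]  # predicted cluster for each observation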
#######################################
# Potential biomarkers classification #
#' Title Classification for the potential Biomarkers
#'
#' @param PBx data matrix
#' @param newy New assigned clusters
#' @param method Classification method for the function to use
#'
#' @return Predicted values for each observation
#'
#' @export
#'
#' @examples
#' # classification_fun(PBx, newy, method='RF')
classification_fun <- function(PBx, newy, method = "RF") {
if (method == "RF") {
output <- RF_classify(PBx, newy)
}
if (method == "RF_downsampling") {
output <- RF_one_by_one(PBx, newy)
}
if (method == "CART_information") {
output <- cart_function(PBx, newy, criteria = "information")
}
if (method == "CART_gini") {
output <- cart_function(PBx, newy, criteria = "gini")
}
return(output)
}
### Random Forest Without Down Sampling ###
#' Title Classification Using Random Forest Without Down Sampling
#'
#' @param PBx data matrix
#' @param newy New assigned clusters
#'
#' @return The predicted values for each observation
#'
#' @export
#'
#' @examples
#' # RF_classify(PBx, newy)
RF_classify <- function(PBx, newy) {
    library(randomForest)
if (!is.factor(newy)) {
warning("y is not a factor - but was coerced into one.")
newy <- as.factor(newy)
}
fulldata <- data.frame(PBx, newy)
rf_clus_PB <- randomForest(newy ~ ., data = fulldata, ntree = 50)
    model <<- rf_clus_PB  # Note: stores the fitted forest in the global environment
return(rf_clus_PB$predicted)
}
### Random forest with down sampling ###
#' Title Classification Using Random Forest With Down Sampling
#'
#' @param PBx data matrix
#' @param newy New assigned clusters
#'
#' @return a list of randomForest objects, one per cluster, each fitted one-vs-rest
#' with the majority class down-sampled to the minority class size
#' @export
#'
#' @examples
#' # RF_one_by_one(PBx, newy)
RF_one_by_one <- function(PBx, newy) {
    library(randomForest)
    if (!is.factor(newy)) {
        warning("y is not a factor - but was coerced into one.")
        newy <- as.factor(newy)
    }
    newy <- as.numeric(newy)  # Work with integer cluster codes 1..K
    rflist_names <- paste("cluster", c(1:length(unique(newy))))
    rflist <- sapply(rflist_names, function(x) NULL)
    for (i in 1:length(unique(newy))) {
        class_2 <- ifelse(newy == i, 1, 0)  # One-vs-rest indicator for cluster i
        nmin <- sum(class_2 == 1)  # Size of the smaller (positive) class
        rflist[[i]] <- randomForest(factor(class_2) ~ ., data = PBx, ntree = 1000,
            importance = TRUE, proximity = TRUE, sampsize = rep(nmin, 2))
}
return(rflist)
}
### CART ###
#' Title Classification Using CART
#'
#' @param PBx data matrix
#' @param newy New assigned clusters
#' @param criteria splitting criterion: 'gini' (default) or 'information'
#'
#' @return vector of the predicted class for each observation
#' @export
#'
#' @examples
#' # cart_function(PBx, newy, 'information')
cart_function <- function(PBx, newy, criteria = "gini") {
    library(rpart)
    fulldata <- data.frame(PBx, newy)
    cart <- rpart(newy ~ ., data = fulldata, method = "class", parms = list(split = criteria))
    model <<- cart  # Note: stores the fitted tree in the global environment
pred <- predict(cart, type = "class")
return(pred)
}
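# A quick way to inspect any of the classifiers' output against the assigned
# clusters (hypothetical objects; commented out):
# pred <- classification_fun(PBx, newy, method = "CART_gini")
# table(predicted = pred, observed = newy)  # confusion matrix of the in-sample predictions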