#' prelim.ML
#' @description makes some preliminary steps useful for Machine Learning (A PCA and a correlation analysis)
#'
#' @param df data frame for analysis
#' @param ncp number of dimensions kept in the results (by default 5) of PCA analysis
#' @param dim.reduction.factor Correlation threshold over which correlated variables will be dropped
#' @param save.to when not NULL, folder path where result files will be written
#' @param suffix when save.to is not NULL, it adds a suffix to file names
#'
#' @return list with max four objects: PCA and correlation of variables, and if variable reduction is possible, PCA and correlation of reduced variables.
#'
#' @export
#'
prelim_ML <- function(df, ncp = 5, dim.reduction.factor = 0.7, save.to = NULL, suffix = ""){
prelim.ML.obj <- list()
if(!is.null(save.to)) mkdir(save.to)
pca.summary.file <- ifelse(is.null(save.to),
"",
file.path(save.to, paste("PCA_summary", suffix , "txt", sep = ".") ) )
pca.red.summary.file <- ifelse(is.null(save.to),
"",
file.path(save.to, paste("PCA_summary_reduced", suffix , "txt", sep = ".") ) )
# Only numeric columns can be used in PCA and correlation
df.num <- df[, sapply(df, is.numeric)]
# Do a Principal Component Analysis
pca.df <- FactoMineR::PCA(df.num, scale.unit=TRUE, ncp=ncp, graph = F)
prelim.ML.obj$pca <- pca.df
summary(pca.df, nbelements = Inf , nbind = Inf, file = pca.summary.file)
#dimdesc(pca.df)
if(!is.null(save.to)){
pdf(file.path(save.to, paste("PCA_graphs", suffix, "pdf", sep=".")), width = 10, height = 10)
devNum = dev.cur()
}
plot(pca.df, cex=0.6, choix = "ind", unselect = 0.7, label="ind.sup" )
plot(pca.df, cex=0.6, choix = "var")
if(!is.null(save.to)){
dev.off(devNum)
}
#Calculate variable correlations
corr.df <- cor(df.num)
prelim.ML.obj$correlation <- corr.df
tl.cex <- ifelse(ncol(df.num) > 30, 0.6, 1.0)
if(!is.null(save.to)){
pdf(file.path(save.to, paste("Correlation_graphs", suffix, "pdf", sep=".")), width = 10, height = 10)
devNum = dev.cur()
}
corrplot::corrplot(corr.df, order="hclust", tl.cex = tl.cex, method = "ellipse", diag = F, tl.col = "darkblue")
if(!is.null(save.to)){
dev.off(devNum)
}
# Perform a similar analysis with reduced number of variables
highlyCor <- caret::findCorrelation(corr.df, dim.reduction.factor)
if(length(highlyCor) > 0){
prelim.ML.obj$highly.correlated.vars <- names(df.num[, highlyCor])
df.red <- df.num[,-highlyCor]
corr.df.red <- cor(df.red)
prelim.ML.obj$correlation.reduced <- corr.df.red
if(!is.null(save.to)){
pdf(file.path(save.to, paste("Correlation_Reduced_graphs", suffix, "pdf", sep=".")), width = 10, height = 10)
devNum = dev.cur()
}
tl.cex.red <- ifelse(ncol(df.red) > 30, 0.6, 1.0)
corrplot::corrplot(corr.df.red, order = "hclust", tl.cex = tl.cex.red, method = "ellipse", diag = F, tl.col = "darkblue")
if(!is.null(save.to)){
dev.off(devNum)
}
pca.df.red <- FactoMineR::PCA(df.red, scale.unit = T, ncp=5, graph = F)
prelim.ML.obj$pca.reduced <- pca.df.red
summary(pca.df.red, nbelements = Inf , nbind = Inf, file = pca.red.summary.file)
if(!is.null(save.to)){
pdf(file.path(save.to, paste("PCA_graphs_red", suffix, "pdf", sep=".")), width = 10, height = 10)
devNum = dev.cur()
}
plot(pca.df.red, cex=0.6, choix = "ind", unselect = 0.7, label="ind.sup" )
plot(pca.df.red, cex=0.6, choix = "var")
if(!is.null(save.to)){
dev.off(devNum)
}
}
return(prelim.ML.obj)
}
#' get.training.sample
#'
#' @description separates a data frame into two data frames for training and testing in ML classification.
#'
#' @param df data frame containing a whole data set.
#' @param class_category boolean column name used for classification.
#' @param factor.training percentage of data used for training. It takes the portion factor.training from the smallest class and a equivalent number of elements from the biggest class.
#' @param grep_rowname string containing a regular expression that limits the elements used for the training by choosing matching elements at rowname.
#'
#' @return list with two data frames: trainset and testset
#'
#' @export
#'
get_training_sample <- function(df, class_category, factor.training, grep_rowname=NULL){
# Sampling for training
df[, class_category] = as.logical(df[, class_category])
df_discardedForTraining <- NULL
if(!is.null(grep_rowname)){
df_discardedForTraining <- df[grep(grep_rowname, rownames(df), invert = T) , ]
df <- df[grep(grep_rowname, rownames(df)) , ]
}
df.class.true <- df[df[,class_category] == T,] # %>% filter_(class_category == T)
df.class.false<- df[df[,class_category] == F,] # %>% filter_(class_category == F)
#set the size of the training depending on the smallest dataset
trainset.size <- round(min(nrow(df.class.true), nrow(df.class.false)) * factor.training, 0)
trainset.index.true <- sample(1:nrow(df.class.true), size = trainset.size, replace = F)
trainset.index.false <- sample(1:nrow(df.class.false), size = trainset.size, replace = F)
trainset <- rbind(df.class.true[ trainset.index.true, ], df.class.false[ trainset.index.false, ])
testset <- rbind(df.class.true[-trainset.index.true, ], df.class.false[-trainset.index.false,])
if(!is.null(df_discardedForTraining)) testset <- rbind(testset, df_discardedForTraining)
train.test.sets <- list(trainset = trainset, testset = testset)
return(train.test.sets)
}
#' contingencyTable
#' @description calculates the contingency table, given test and predicted values, and a threshold for the predicted values.
#'
#' @param predicted vector with predicted values (if it is a score, a threshold should be given)
#' @param test_var vector with classification
#'
#' @return data frame with the four values of the contingency table
#'
#' @export
#'
contingencyTable <- function(predicted, test_var, threshold = 0.6){
predicted_boolean <- ifelse(predicted > threshold, TRUE, FALSE)
ct <- data.frame(
TruePos = (test_var == predicted_boolean & test_var == TRUE),
TrueNeg = (test_var == predicted_boolean & test_var == FALSE),
FalsePos = (!(test_var == predicted_boolean) & test_var == FALSE),
FalseNeg = (!(test_var == predicted_boolean) & test_var == TRUE)
) %>% summarise_all(sum)
return(ct)
}
#WARNING: not finished yet!!
#' retrieve.data.modeling
#'
#' @description retrieves data for modeling from a defined data structure
#'
#' @param cell.line cell line from which data should be retrieved
#' @param is_perturbation
retrieve_data_modeling <- function(cell.line, is_perturbation, ML_method){
# Define file and folder names
experim.folder <- ifelse(is_perturbation, "perturbations", "cell.lines")
modeling.folder <- file.path(E2Predictor.Config$working.path, "tables.for.modeling", experim.folder, cell.line)
ML.folder <- file.path(E2Predictor.Config$working.path, "data_analyses", "machine_learning", ML_method, experim.folder, cell.line)
if(is_perturbation){
modeling.folder <- file.path(modeling.folder, paste(condition, addgrepcond, sep = "_") )
ML.folder <- file.path(ML.folder, paste(condition, addgrepcond, sep = "_") )
}
mkdir(ML.folder)
modeling.file <- file.path(modeling.folder, "omics_table_for_modeling.RData")
# load data for modeling
return(readRDS(modeling.file))
}
#' prepare_data_modeling
#' @description fix some variable names (from gene ontology), filter variables, and normalize data
#'
#' @param df data frame containing data to be modeled
#' @param useVariables a vector containing variable names to be used. When NULL, all variables are used.
#'
#' @export
#'
prepare_data_modeling <- function(df, useVariables=NULL){
omics.model <- df %>% distinct(key, protein_entry, .keep_all=TRUE)
# Combine in rownames: key and protein name.
omics.key_protein <- paste(omics.model$key, omics.model$protein_entry, sep="#")
if(!is.null(useVariables)) omics.model <- omics.model[, useVariables]
#rename variables for the model
renaming.vars <- stringr::str_extract(names(omics.model), "\\[GO:.*\\]")
renaming.vars <- gsub("\\[", "", renaming.vars)
renaming.vars <- gsub("\\]", "", renaming.vars)
renaming.vars <- gsub(":", "", renaming.vars)
renaming.vars.index <- which(!is.na(renaming.vars))
renaming.vars <-renaming.vars[renaming.vars.index]
names(omics.model)[renaming.vars.index] <- renaming.vars
# Normalize variables
omics.model[is.na(omics.model)] <- 0.0
maxs <- apply(omics.model, 2, max)
mins <- apply(omics.model, 2, min)
num.vars <- ncol(omics.model) - 1
omics.model.sc <- as.data.frame(scale(omics.model, center = mins, scale = maxs - mins))
rownames(omics.model.sc) <- omics.key_protein
return(omics.model.sc)
}
#' evaluate_ML
#' @description evaluate a Machine Learning prediction
#'
#' @param predictor ML predict output object
#' @param testset data set for testing
#' @param class_category classification category
#' @param save.to path to save results. When NULL, results will be displayed online
#'
#' @export
#'
evaluate_ML <- function(predictor, testset, class_category, evaluation.features=NULL, save.to=NULL, trainset=NULL){
if(!is.null(save.to)) mkdir(save.to)
pred.summary <- summary(predictor)
testset.new <- testset
testset.new$classification <- testset[, class_category]
prediction_done = F
# If this is a neuralnetwork object it works different to other predictors
if(any(c("nn") %in% class(predictor)) ){
vals <- as.matrix(testset[, predictor$model.list$variables])
testset.new$prediction <- as.numeric(neuralnet::compute(predictor, vals)$net.result)
if(!is.null(save.to)){
pdf(file.path(save.to, "plotnet.pdf"), 30,9)
NeuralNetTools::plotnet(predictor)
dev.off()
}
prediction_done = T
}
if( any(c("mlp", "nnet") %in% class(predictor)) ){
testset.new$prediction <- as.numeric(
predict(predictor, testset[, (names(testset) %in% evaluation.features) ]))
if(!is.null(save.to)){
pdf(file.path(save.to, "plotnet.pdf"), 30,9)
NeuralNetTools::plotnet(predictor)
dev.off()
}
prediction_done = T
}
if(!prediction_done){
testset.new$prediction <- as.numeric(predict(predictor, testset))
}
f <- as.formula(paste(class_category, "prediction", sep = "~"))
my.roc <- pROC::roc(classification ~ prediction, data=testset.new, plot=F, smooth=F)
if(!is.null(save.to)) pdf(file.path(save.to, "roc_plot.pdf"))
plot(my.roc)
if(!is.null(save.to)) dev.off()
if(!is.null(save.to)) sink(file.path(save.to, "prediction_summary.txt"))
print(pred.summary)
if(!is.null(save.to)) sink()
if(!is.null(save.to)) pdf(file.path(save.to, "separation_plots.pdf"))
colors <- ifelse(my.roc$response == 0, "black", "red")
plot(my.roc$predictor,pch=19,cex=0.3, col=colors)
y = as.numeric(my.roc$predictor)
x = as.numeric(my.roc$response)
plot(density(y[x==0]))
lines(density(y[x>0]), col="red")
if(!is.null(save.to)) dev.off()
if(!is.null(save.to)) saveRDS(predictor, file=file.path(save.to, "predictor.Rds" ))
if(!is.null(save.to)) sink(file.path(save.to, "prediction_ROC_summary.txt"))
print(my.roc$call)
print(my.roc$auc)
print(cat("Classification levels:", my.roc$levels) )
print(cat("Direction:", my.roc$direction))
print(cat("Calculated in percent:", my.roc$percent))
print(cat("Sensitivities:", my.roc$sensitivities))
print(cat("Specifities:", my.roc$specificities))
print(cat("Thresholds:", my.roc$thresholds))
if(!is.null(save.to)) sink()
if(!is.null(save.to)){
if(any(class(predictor) == "nnet")) return(my.roc)
trs = trainset
lek <- NeuralNetTools::lekprofile(predictor, trs)
ggplot2::ggsave(file.path(save.to, "lek.pdf"),plot = lek, width = 50, height = 9, limitsize = F)
}
return(my.roc)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.