knitr::opts_chunk$set(
  collapse = TRUE,
  fig.width = 7,
  fig.height = 5,
  fig.path = "Figs/",
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  comment = "#>"
)
library(JTIMLmaster)
library(Biobase)
library(DESeq2)
library(edgeR)
library(ggplot2)
library(SmartSVA)
library(pheatmap)
library(data.table)
library(plyr)
library(dplyr)
library(DT)
library(car)
library(randomForest)
library(glmnet)
library(caret)
library(e1071)
library(VSURF)
library(neuralnet)
library(gbm)
library(verification)
library(pROC)
This dataset originally comes from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective is to diagnostically predict whether or not a patient has diabetes, based on the diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database; in particular, all patients are females at least 21 years old of Pima Indian heritage.
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, BMI, insulin level, age, and so on.
Can we build a machine learning model that accurately predicts whether or not the patients in the dataset have diabetes?
This vignette shows how to apply several machine learning approaches to the diabetes data.
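The analysis below assumes that the data are already available as a data frame named diabetes_data with the eight predictors and the binary Outcome column. As a minimal loading sketch (assuming the commonly distributed Kaggle file diabetes.csv, which is not part of this vignette), the data could be read in as follows:

## Assumed file name and location; adjust to wherever the data are stored.
## In the standard release the file has 768 rows and 9 columns, with the
## binary target in "Outcome" (0 = no diabetes, 1 = diabetes).
diabetes_data <- read.csv("diabetes.csv", stringsAsFactors = FALSE)
str(diabetes_data)
table(diabetes_data$Outcome)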
diabetes_data$Outcome <- factor(diabetes_data$Outcome, levels = c("0", "1"))  ## The data object is diabetes_data

set.seed(415)
K <- 10  ## Number of bootstrap replicates
n.case <- which(diabetes_data$Outcome == 1)
n.control <- which(diabetes_data$Outcome == 0)
## Stratified bootstrap: sample cases and controls with replacement for each replicate
train_rows <- lapply(1:K, function(x){
  c(sample(n.case, length(n.case), replace = TRUE),
    sample(n.control, length(n.control), replace = TRUE))
})
depVar <- "Outcome"
alphaLevel <- c(0.4, 0.7, 1)
ntree <- 100

### Random Forest ###
RFresult <- lapply(train_rows, function(x){
  RF.boot.err.Func(train = diabetes_data[x, ], test = diabetes_data[-x, ],
                   depVar = depVar, ntree = ntree)
})
names(RFresult) <- paste0("RF_iter", 1:K)

### Penalized Regression (Elastic Net / LASSO) ###
PRresult <- lapply(train_rows, function(x){
  PR.boot.err.Func(x.train = diabetes_data[x, names(diabetes_data) != depVar],
                   y.train = diabetes_data[x, depVar],
                   x.test = diabetes_data[-x, names(diabetes_data) != depVar],
                   y.test = diabetes_data[-x, depVar],
                   alphaLevel = alphaLevel, family = "binomial", type = "class")
})
names(PRresult) <- paste0("PR_iter", 1:K)

### SVM ###
SVMresult <- lapply(train_rows, function(x){
  SVM.boot.err.Func(train = diabetes_data[x, ], test = diabetes_data[-x, ],
                    depVar = depVar, kernel = "radial", cost = 5)
})
names(SVMresult) <- paste0("SVM_iter", 1:K)

### Neural Network ###
NNresult <- lapply(train_rows, function(x){
  NN.boot.err.Func(train = diabetes_data[x, ], test = diabetes_data[-x, ],
                   depVar = depVar, hidden = 10)
})
names(NNresult) <- paste0("NN_iter", 1:K)

### Gradient Boosting (gbm R package) ###
### Note: in some runs the predicted values are all 0, in which case they cannot be compared to the true values.
GBMresult <- lapply(train_rows, function(x){
  GBM.boot.err.Func(train = diabetes_data[x, ], test = diabetes_data[-x, ],
                    depVar = depVar, distribution = "adaboost", n.trees = ntree)
})
names(GBMresult) <- paste0("GBM_iter", 1:K)
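The *.boot.err.Func() wrappers are provided by JTIMLmaster and their internals are not shown here. Purely as an illustration of the idea, and not the package's actual implementation, a random forest wrapper could fit the model on the bootstrap sample and evaluate it on the held-out rows roughly like this:

## Hypothetical sketch of a bootstrap train/test wrapper; the real RF.boot.err.Func()
## may differ in its return values and accuracy calculations.
rf_boot_sketch <- function(train, test, depVar, ntree = 100) {
  fmla <- as.formula(paste(depVar, "~ ."))
  fit  <- randomForest(fmla, data = train, ntree = ntree, importance = TRUE)
  prob <- predict(fit, newdata = test, type = "prob")[, "1"]   ## predicted probability of class "1"
  pred <- predict(fit, newdata = test, type = "response")      ## predicted class labels
  truth <- test[[depVar]]
  list(
    AUC = as.numeric(pROC::auc(pROC::roc(truth, prob, quiet = TRUE))),
    Misclassification = mean(pred != truth),
    importance = randomForest::importance(fit)
  )
}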
The following plot and table show the estimated accuracy measures (AUC, sensitivity, specificity, and misclassification rate) for the random forest model across all replicates.
RF.err.measure <- data.frame(
  AUC = unlist(lapply(RFresult, function(x){x$RF.err[1, "overall"]})),
  Sensitivity = unlist(lapply(RFresult, function(x){x$RF.err[2, "overall"]})),
  Specificity = unlist(lapply(RFresult, function(x){x$RF.err[3, "overall"]})),
  Misclassification = unlist(lapply(RFresult, function(x){x$RF.err[4, "overall"]}))
)
RF.err.measure.melt <- melt(RF.err.measure)
names(RF.err.measure.melt) <- c("accuracy.measure", "value")
RF.err.measure.melt$accuracy.measure <- factor(RF.err.measure.melt$accuracy.measure,
                                               levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))
ggplot(RF.err.measure.melt) +
  geom_boxplot(aes(x = accuracy.measure, y = value * 100, fill = accuracy.measure)) +
  ggtitle(paste0("Random Forest (K=", K, ", ntree=", ntree, ")")) +
  ylab("percentage(%)")
tempD <- ddply(RF.err.measure.melt, ~ accuracy.measure, summarise,
               mean = round(mean(value), digits = 4),
               sd = round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD,
              caption = paste0("Mean(SD) of Overall Accuracy Measures using RF (K=", K, ")"),
              rownames = FALSE)
The following table shows the variable importance measures, ordered by mean decrease in accuracy. Top-ranked variables will be included in the final model.
all.important.measure <- lapply(RFresult, function(x){
  x$importance_measure[order(rownames(x$importance_measure)), ]
})
all.important.measure.DF <- do.call("cbind", all.important.measure)
all.important.measure.mean.decrease.acc.mean <- apply(
  all.important.measure.DF[, grep("Mean.Decrease.Accuracy", names(all.important.measure.DF))], 1, mean)
all.important.measure.mean.Decrease.Gini.mean <- apply(
  all.important.measure.DF[, grep("Mean.Decrease.Gini", names(all.important.measure.DF))], 1, mean)
important_measure_mean <- data.frame(
  Mean.Decrease.Accuracy = round(all.important.measure.mean.decrease.acc.mean, 4),
  Mean.Decrease.Gini = round(all.important.measure.mean.Decrease.Gini.mean, 4))
important_measure_mean <- important_measure_mean[order(important_measure_mean$Mean.Decrease.Accuracy, decreasing = TRUE), ]
DT::datatable(important_measure_mean,
              caption = paste0("Mean of Importance Measures from Random Forest (K=", K, ")"))
The following plot and table show the estimated accuracy measures averaged over all replicates, based on penalized logistic regression models with different tuning parameters (alpha levels).
## For each iteration, the penalized regression run returns a matrix of Model x err.measures.
## 1. Combine them across all K iterations; this gives a Model x K matrix for each accuracy measure.
## 2. Plot AUC, Sensitivity, Specificity, and Misclassification, faceted by Model.
PR.err.measure <- list(
  AUC = do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["AUC.overall", ]})),
  Sensitivity = do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["sensitivity.overall", ]})),
  Specificity = do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["specificity.overall", ]})),
  Misclassification = do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["misclassification.overall", ]}))
)
PR.err.measure <- lapply(PR.err.measure, function(x){
  as.data.frame(t(x), row.names = c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)"))
})
PR.err.measure.DF <- do.call("cbind", PR.err.measure)
PR.err.measure.DF$Model <- rownames(PR.err.measure.DF)
PR.err.measure.DF.melt <- melt(PR.err.measure.DF, id = "Model")
PR.err.measure.DF.melt$variable <- factor(
  unlist(lapply(strsplit(as.character(PR.err.measure.DF.melt$variable), "[.]"), function(x){x[[1]]})),
  levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))
PR.err.measure.DF.melt$Model <- factor(PR.err.measure.DF.melt$Model,
                                       levels = c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)"))
ggplot(PR.err.measure.DF.melt) +
  geom_boxplot(aes(x = variable, y = value * 100, fill = variable)) +
  facet_grid(~ Model) +
  ggtitle(paste0("Penalized Regressions (K=", K, ", Lasso/EN)")) +
  ylab("percentage(%)")

## 3. Mean and SD of the accuracy measures for each Model
PR.err.measure.mean <- do.call("cbind", lapply(PR.err.measure, function(x){
  paste0(round(apply(x, 1, mean), 4), " (sd=", round(apply(x, 1, sd), 4), ")")
}))
rownames(PR.err.measure.mean) <- c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)")
DT::datatable(PR.err.measure.mean,
              caption = paste0("Mean (SD) of Accuracy Measures using Penalized Regression (K=", K, ")"),
              rownames = TRUE)
The following table shows how many times each variable was selected into the model across all replicates. The variables are ranked by selection frequency, and the top-ranked ones will be used as predictors.
## 4. Non-zero coefficients: count how many times each variable was selected.
## For each model (alpha level), order the selected (non-zero coefficient) variables by their counts.
PR.non0coeff <- list(alphaLevel_04 = lapply(PRresult, function(x){x$non0coeff[[1]]}),
                     alphaLevel_07 = lapply(PRresult, function(x){x$non0coeff[[2]]}),
                     alphaLevel_1  = lapply(PRresult, function(x){x$non0coeff[[3]]}))
PR.non0coeff_variables <- list(alphaLevel_04 = lapply(PRresult, function(x){names(x$non0coeff[[1]])}),
                               alphaLevel_07 = lapply(PRresult, function(x){names(x$non0coeff[[2]])}),
                               alphaLevel_1  = lapply(PRresult, function(x){names(x$non0coeff[[3]])}))
union_non0coeff <- data.frame(Variables = unique(unlist(lapply(PR.non0coeff_variables, function(x){unlist(x)}))))
union.all.non0coeff <- lapply(PR.non0coeff_variables, function(x){table(unlist(x))})
union.all.non0coeff <- lapply(union.all.non0coeff, function(x){
  merge(union_non0coeff, data.frame(Variables = names(x), Counts = as.vector(x)), by = "Variables", all.x = TRUE)
})
# union.all.non0coeff <- lapply(union.all.non0coeff, function(x){x[order(x$Counts, decreasing = TRUE), ]})
union.all.non0coeff_all <- merge(merge(union.all.non0coeff[[1]], union.all.non0coeff[[2]], by = "Variables"),
                                 union.all.non0coeff[[3]], by = "Variables")
names(union.all.non0coeff_all) <- c("Variables", "alphaLevel=0.4", "alphaLevel=0.7", "alphaLevel=1")
union.all.non0coeff_all <- union.all.non0coeff_all[order(union.all.non0coeff_all$`alphaLevel=0.4`, decreasing = TRUE), ]
union.all.non0coeff_all$Variables <- as.character(union.all.non0coeff_all$Variables)
DT::datatable(union.all.non0coeff_all[-1, ], rownames = FALSE,
              caption = paste0("Selected Variables and their counts for each alpha (K=", K, ")"))

## Select top-ranked variables selected in at least 70% of the K replicates (LASSO model).
## Their coefficients can then be estimated in a logistic model and used as a risk score for new testing data.
polygenicGenes <- union.all.non0coeff_all$Variables[which(union.all.non0coeff_all$`alphaLevel=1` >= round(0.7 * K))]
polygenicGenes <- polygenicGenes[polygenicGenes != "(Intercept)"]
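As a follow-up to the variable-selection step above, the frequently selected predictors (stored in polygenicGenes) could be refit in a plain logistic regression, whose predicted probabilities then serve as a single risk score for new observations. This is only a sketch of that idea under the assumption that polygenicGenes contains column names of diabetes_data; it is not output produced by the chunks above.

## Sketch: refit a logistic model on the frequently selected variables and
## use the predicted probability as a risk score (hypothetical usage).
if (length(polygenicGenes) > 0) {
  fmla <- as.formula(paste("Outcome ~", paste(polygenicGenes, collapse = " + ")))
  logit_fit <- glm(fmla, data = diabetes_data, family = binomial)
  ## Predicted probability of diabetes; could equally be applied to an external test set.
  risk_score <- predict(logit_fit, newdata = diabetes_data, type = "response")
  summary(risk_score)
}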
The following plot and table show the estimated accuracy measures based on the support vector machine (SVM).
SVM.err.measure <- data.frame(
  AUC = unlist(lapply(SVMresult, function(x){x$SVM.err[1, "overall"]})),
  Sensitivity = unlist(lapply(SVMresult, function(x){x$SVM.err[2, "overall"]})),
  Specificity = unlist(lapply(SVMresult, function(x){x$SVM.err[3, "overall"]})),
  Misclassification = unlist(lapply(SVMresult, function(x){x$SVM.err[4, "overall"]}))
)
SVM.err.measure.melt <- melt(SVM.err.measure)
names(SVM.err.measure.melt) <- c("accuracy.measure", "value")
SVM.err.measure.melt$accuracy.measure <- factor(SVM.err.measure.melt$accuracy.measure,
                                                levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))
ggplot(SVM.err.measure.melt) +
  geom_boxplot(aes(x = accuracy.measure, y = value * 100, fill = accuracy.measure)) +
  ggtitle(paste0("Support Vector Machine (K=", K, ", Radial Kernel)")) +
  ylab("percentage(%)")
tempD <- ddply(SVM.err.measure.melt, ~ accuracy.measure, summarise,
               mean = round(mean(value), digits = 4),
               sd = round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD,
              caption = paste0("Mean(SD) of Overall Accuracy Measures using SVM (K=", K, ")"),
              rownames = FALSE)
The following plot and table show the estimated accuracy measures based on the neural network (NN).
NN.err.measure <- data.frame(
  AUC = unlist(lapply(NNresult, function(x){x$NN.err[1, "overall"]})),
  Sensitivity = unlist(lapply(NNresult, function(x){x$NN.err[2, "overall"]})),
  Specificity = unlist(lapply(NNresult, function(x){x$NN.err[3, "overall"]})),
  Misclassification = unlist(lapply(NNresult, function(x){x$NN.err[4, "overall"]}))
)
NN.err.measure.melt <- melt(NN.err.measure)
names(NN.err.measure.melt) <- c("accuracy.measure", "value")
NN.err.measure.melt$accuracy.measure <- factor(NN.err.measure.melt$accuracy.measure,
                                               levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))
ggplot(NN.err.measure.melt) +
  geom_boxplot(aes(x = accuracy.measure, y = value * 100, fill = accuracy.measure)) +
  ggtitle(paste0("Neural Network (K=", K, ", hidden=10)")) +
  ylab("percentage(%)")
tempD <- ddply(NN.err.measure.melt, ~ accuracy.measure, summarise,
               mean = round(mean(value), digits = 4),
               sd = round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD,
              caption = paste0("Mean(SD) of Overall Accuracy Measures using NN (K=", K, ")"),
              rownames = FALSE)
The following plot and table show the estimated accuracy measures based on gradient boosting (GBM).
GBM.err.measure <- data.frame(
  AUC = unlist(lapply(GBMresult, function(x){x$GBM.err[1, "overall"]})),
  Sensitivity = unlist(lapply(GBMresult, function(x){x$GBM.err[2, "overall"]})),
  Specificity = unlist(lapply(GBMresult, function(x){x$GBM.err[3, "overall"]})),
  Misclassification = unlist(lapply(GBMresult, function(x){x$GBM.err[4, "overall"]}))
)
GBM.err.measure.melt <- melt(GBM.err.measure)
names(GBM.err.measure.melt) <- c("accuracy.measure", "value")
GBM.err.measure.melt$accuracy.measure <- factor(GBM.err.measure.melt$accuracy.measure,
                                                levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))
ggplot(GBM.err.measure.melt) +
  geom_boxplot(aes(x = accuracy.measure, y = value * 100, fill = accuracy.measure)) +
  ggtitle(paste0("Gradient Boosting (K=", K, ")")) +
  ylab("percentage(%)")
tempD <- ddply(GBM.err.measure.melt, ~ accuracy.measure, summarise,
               mean = round(mean(value), digits = 4),
               sd = round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD,
              caption = paste0("Mean(SD) of Overall Accuracy Measures using GBM (K=", K, ")"),
              rownames = FALSE)
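To relate the results back to the question posed at the start, the per-replicate AUC values computed above can be placed side by side. This comparison is an added sketch built only from objects already created (RF.err.measure, SVM.err.measure, NN.err.measure, GBM.err.measure); penalized regression is tabulated separately above, and the GBM column should be read with the earlier caveat about its predictions in mind.

## Side-by-side comparison of the bootstrap AUC estimates from the four classifiers summarised above.
AUC.compare <- data.frame(RF  = RF.err.measure$AUC,
                          SVM = SVM.err.measure$AUC,
                          NN  = NN.err.measure$AUC,
                          GBM = GBM.err.measure$AUC)
AUC.compare.melt <- melt(AUC.compare)
names(AUC.compare.melt) <- c("Method", "AUC")
ggplot(AUC.compare.melt) +
  geom_boxplot(aes(x = Method, y = AUC * 100, fill = Method)) +
  ggtitle(paste0("AUC across methods (K=", K, ")")) +
  ylab("percentage(%)")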