Load data
untar <- read.csv("../data/zscore_untargeted_annotatedions.csv") clin <- read.csv("../data/partitioning.csv") names(clin)[1] <- "patients" untarmet <- read.csv("../data/CRIC_clinical_creatinine_normalized_samplemetadata.csv", header = FALSE) jing <- read.csv("../data/60 var selected by 8 models from lasso RF TB (models with both ions and cli).csv") jing_ions <- as.numeric(unique(substring(jing$Ion, 5,15))) # save copy of original untargeted ions original_untar <- untar # only include V3Y0 untar <- untar[untar$visit == "V3Y0",] pats <- untar$patientid # include only jing ions untar <- untar[,names(untar) %in% paste0("V", jing_ions)] # combine untargeted and clinical data untar$patients <- pats combined <- merge(untar, clin, by = "patients") # separate into groups, remove patient information train1 <- combined[combined$group == 0,][-1] train2 <- combined[combined$group == 1,][-1] train3 <- combined[combined$group == 2,][-1] clin1 <- clin[clin$group == 0,][-1] clin2 <- clin[clin$group == 1,][-1] clin3 <- clin[clin$group == 2,][-1]
library(caret) library(ModelMetrics) library(randomForest) library(knitr) library(caret) library(e1071) library(pROC) library(ROCR) linear_model <- function(model, data, newdata, predict) { if (predict == "rate") { training_MSE <- mse(model$fitted.values, data$rawdata.mu_egfr) pred <- predict(model, newdata = newdata) testing_MSE <- mse(pred,newdata$rawdata.mu_egfr) se <- (pred-newdata$rawdata.mu_egfr)^2 } else if (predict == "volatility") { training_MSE <- mse(model$fitted.values, data$rawdata.sigma_egfr) pred <- predict(model, newdata = newdata) testing_MSE <- mse(pred,newdata$rawdata.sigma_egfr) se <- (pred-newdata$rawdata.sigma_egfr)^2 } return(se) } rf_model <- function(model, data, newdata, predict) { if (predict == "rate") { training_MSE <- mse(model$predicted, data$rawdata.mu_egfr) pred <- predict(model, newdata = newdata) testing_MSE <- mse(pred,newdata$rawdata.mu_egfr) se <- (pred-newdata$rawdata.mu_egfr)^2 } else if (predict == "volatility") { training_MSE <- mse(model$predicted, data$rawdata.sigma_egfr) pred <- predict(model, newdata = newdata) testing_MSE <- mse(pred,newdata$rawdata.sigma_egfr) se <- (pred-newdata$rawdata.sigma_egfr)^2 } return(se) }
Predicting EGFR Rate with Clinical Variables
model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin1, clin2) ) lcr1 <- linear_model(model, rbind(clin1,clin2), clin3, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin2, clin3)) lcr2 <- linear_model(model, rbind(clin2,clin3), clin1, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin1, clin3)) lcr3 <- linear_model(model, rbind(clin1,clin3), clin2, predict = "rate")
Predicting EGFR Volatility with Clinical Variables
model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin1, clin2)) lcv1 <- linear_model(model, rbind(clin1,clin2), clin3, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin2, clin3)) lcv2 <- linear_model(model, rbind(clin2,clin3), clin1, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(clin1, clin3)) lcv3 <- linear_model(model, rbind(clin1,clin3), clin2, predict = "volatility")
Predicting EGFR Rate with Clinical Variables and Untargeted Ions
model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train1, train2)) lbr1 <- linear_model(model, rbind(train1,train2), train3, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train2, train3)) lbr2 <- linear_model(model, rbind(train2,train3), train1, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train1, train3)) lbr3 <- linear_model(model, rbind(train1,train3), train2, predict = "rate")
Predicting EGFR Volatility with combical Variables and Untargeted Ions
model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train1, train2)) lbv1 <- linear_model(model, rbind(train1,train2), train3, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train2, train3)) lbv2 <- linear_model(model, rbind(train2,train3), train1, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(train1, train3)) lbv3 <- linear_model(model, rbind(train1,train3), train2, predict = "volatility")
Predicting EGFR Rate with Clinical Variables
rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin1,clin2), importance = TRUE, na.action = "na.exclude") rfcr1 <- rf_model(rf, rbind(clin1,clin2), clin3, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin2,clin3), importance = TRUE, na.action = "na.exclude") rfcr2 <- rf_model(rf, rbind(clin2,clin3), clin1, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin1,clin3), importance = TRUE, na.action = "na.exclude") rfcr3 <- rf_model(rf, rbind(clin1,clin3), clin2, predict = "rate")
Predicting EGFR Volatility with Clinical Variables
rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin1,clin2), importance = TRUE, na.action = "na.exclude") rfcv1 <- rf_model(rf, rbind(clin1,clin2), clin3, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin2,clin3), importance = TRUE, na.action = "na.exclude") rfcv2 <- rf_model(rf, rbind(clin2,clin3), clin1, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(clin1,clin3), importance = TRUE, na.action = "na.exclude") rfcv3 <- rf_model(rf, rbind(clin1,clin3), clin2, predict = "volatility")
Predicting EGFR Rate with Clinical Variables and Untargeted Ions
rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train1,train2), importance = TRUE, na.action = "na.exclude") rfbr1 <- rf_model(rf, rbind(train1,train2), train3, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train2,train3), importance = TRUE, na.action = "na.exclude") rfbr2 <- rf_model(rf, rbind(train2,train3), train1, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train1,train3), importance = TRUE, na.action = "na.exclude") rfbr3 <- rf_model(rf, rbind(train1,train3), train2, predict = "rate")
Predicting EGFR Volatility with clinical Variables and Untargeted Ions
rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train1,train2), importance = TRUE, na.action = "na.exclude") rfbv1 <- rf_model(rf, rbind(train1,train2), train3, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train2,train3), importance = TRUE, na.action = "na.exclude") rfbv2 <- rf_model(rf, rbind(train2,train3), train1, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(train1,train3), importance = TRUE, na.action = "na.exclude") rfbv3 <- rf_model(rf, rbind(train1,train3), train2, predict = "volatility")
df <- data.frame("lbr1" = lbr1, "lbr2" = c(lbr2, rep(NA, 2)), "lbr3" = lbr3, "lbv1" = lbv1, "lbv2" = c(lbv2, rep(NA, 2)), "lbv3" = lbv3, "rfbr1" = rfbr1, "rfbr2" = c(rfbr2, rep(NA, 2)), "rfbr3" = rfbr3, "rfbv1" = rfbv1, "rfbv2" = c(rfbv2, rep(NA, 2)), "rfbv3" = rfbv3 ) #write.csv(df, "../data/reference_model_df.csv")
Plots for each model
boxplot(lcr1, lcr2, lcr3, outline = FALSE) title("Linear model for Rate with Clinical, 3x Cross Val.") boxplot(lcv1, lcv2, lcv3, outline = FALSE) title("Linear model for Volatility with Clinical, 3x Cross Val.") boxplot(lbr1, lbr2, lbr3, outline = FALSE) title("Linear model for Rate with Clinical and Ions, 3x Cross Val.") boxplot(lbv1, lbv2, lbv3, outline = FALSE) title("Linear model for Volatility with Clinical and Ions, 3x Cross Val.") boxplot(rfcr1, rfcr2, rfcr3, outline = FALSE) title("Random Forest model for Rate with Clinical, 3x Cross Val.") boxplot(rfcv1, rfcv2, rfcv3, outline = FALSE) title("Random Forest model for Volatility with Clinical, 3x Cross Val.") boxplot(rfbr1, rfbr2, rfbr3, outline = FALSE) title("Random Forest model for Rate with Clinical and Ions, 3x Cross Val.") boxplot(rfbv1, rfbv2, rfbv3, outline = FALSE) title("Random Forest model for Volatility with Clinical and Ions, 3x Cross Val.")
Plots comparing models (using ggplot and with p vals comparing means, first rep from cross val)
library(ggplot2) library(ggpubr) rate_data <- data.frame("model" = c(rep("Linear: Clin", length(lcr1)), rep("Linear: Clin and Ions", length(lbr1)), rep("RF: Clin", length(rfcr1)), rep("RF: Clin and Ions", length(rfbr1))), "error" = c(lcr1, lbr1, rfcr1, rfbr1) ) rate_data$model <- factor(rate_data$model, levels=c("Linear: Clin", "Linear: Clin and Ions", "RF: Clin", "RF: Clin and Ions")) compare_means(error~model, data = rate_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and Ions"), c("RF: Clin", "RF: Clin and Ions")) ggplot(rate_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.017, tip.length = 0.001) + ylim(0, 0.018)+ ggtitle("MSE for Models Predicting EGFR Rate of Decline") vol_data <- data.frame("model" = c(rep("Linear: Clin", length(lcv1)), rep("Linear: Clin and Ions", length(lbv1)), rep("RF: Clin", length(rfcv1)), rep("RF: Clin and Ions", length(rfbv1))), "error" = c(lcv1, lbv1, rfcv1, rfbv1) ) vol_data$model <- factor(vol_data$model, levels=c("Linear: Clin", "Linear: Clin and Ions", "RF: Clin", "RF: Clin and Ions")) compare_means(error~model, data = vol_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and Ions"), c("RF: Clin", "RF: Clin and Ions")) ggplot(vol_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.0055, tip.length = 0.001) + ylim(0, 0.006)+ ggtitle("MSE for Models Predicting EGFR Volatility")
Table of MSEs (first rep from cross val)
mse <- data.frame("Linear with Clinical" = c(round(mean(lcr1), digits = 3), round(mean(lcv1), digits = 3)), "Linear with Clinical and Ions" = c(round(mean(lbr1), digits = 3), round(mean(lbv1), digits = 3)), "RF with Clinical" = c(round(mean(rfcr1), digits = 3), round(mean(rfcv1), digits = 3)), "RF with Clinical and Ions" = c(round(mean(rfbr1), digits = 3), round(mean(rfbv1), digits = 3))) rownames(mse) <- c("Rate", "Volatility") kable(mse) write.csv(mse, "../data/reference_model_mses.csv")
Load coordinate data
pca_coords <- read.csv("../data/pca_train_coords.csv") pca_coords_t1 <- merge(clin[clin$group == 0,], pca_coords, by = "patients")[-1] pca_coords_t2 <- merge(clin[clin$group == 1,], pca_coords, by = "patients")[-1] pca_coords_t3 <- merge(clin[clin$group == 2,], pca_coords, by = "patients")[-1]
Linear Models Predicting Rate
model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t1, pca_coords_t2)) pca_lbr1 <- linear_model(model, rbind(pca_coords_t1,pca_coords_t2), pca_coords_t3, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t2, pca_coords_t3)) pca_lbr2 <- linear_model(model, rbind(pca_coords_t2,pca_coords_t3), pca_coords_t1, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t1, pca_coords_t3)) pca_lbr3 <- linear_model(model, rbind(pca_coords_t1,pca_coords_t3), pca_coords_t2, predict = "rate")
Predicting Volatility
model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t1, pca_coords_t2)) pca_lbv1 <- linear_model(model, rbind(pca_coords_t1,pca_coords_t2), pca_coords_t3, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t2, pca_coords_t3)) pca_lbv2 <- linear_model(model, rbind(pca_coords_t2,pca_coords_t3), pca_coords_t1, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(pca_coords_t1, pca_coords_t3)) pca_lbv3 <- linear_model(model, rbind(pca_coords_t1,pca_coords_t3), pca_coords_t2, predict = "volatility")
Random Forest Predicting Rate
rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t1,pca_coords_t2), importance = TRUE, na.action = "na.exclude") pca_rfbr1 <- rf_model(rf, rbind(pca_coords_t1,pca_coords_t2), pca_coords_t3, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t2,pca_coords_t3), importance = TRUE, na.action = "na.exclude") pca_rfbr2 <- rf_model(rf, rbind(pca_coords_t2,pca_coords_t3), pca_coords_t1, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t1,pca_coords_t3), importance = TRUE, na.action = "na.exclude") pca_rfbr3 <- rf_model(rf, rbind(pca_coords_t1,pca_coords_t3), pca_coords_t2, predict = "rate")
Predicting Volatility
rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t1,pca_coords_t2), importance = TRUE, na.action = "na.exclude") pca_rfbv1 <- rf_model(rf, rbind(pca_coords_t1,pca_coords_t2), pca_coords_t3, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t2,pca_coords_t3), importance = TRUE, na.action = "na.exclude") pca_rfbv2 <- rf_model(rf, rbind(pca_coords_t2,pca_coords_t3), pca_coords_t1, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(pca_coords_t1,pca_coords_t3), importance = TRUE, na.action = "na.exclude") pca_rfbv3 <- rf_model(rf, rbind(pca_coords_t1,pca_coords_t3), pca_coords_t2, predict = "volatility")
Plots comparing models
pca_rate_data <- data.frame("model" = c(rep("Linear: Clin", length(lcr1)), rep("Linear: Clin and PCA Coords", length(pca_lbr1)), rep("RF: Clin", length(rfcr1)), rep("RF: Clin and PCA Coords", length(pca_rfbr1))), "error" = c(lcr1, pca_lbr1, rfcr1, pca_rfbr1) ) pca_rate_data$model <- factor(pca_rate_data$model, levels=c("Linear: Clin", "Linear: Clin and PCA Coords", "RF: Clin", "RF: Clin and PCA Coords")) compare_means(error~model, data = pca_rate_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and PCA Coords"), c("RF: Clin", "RF: Clin and PCA Coords")) ggplot(pca_rate_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.017, tip.length = 0.001) + ylim(0, 0.018)+ ggtitle("MSE for Models Predicting EGFR Rate of Decline") pca_vol_data <- data.frame("model" = c(rep("Linear: Clin", length(lcv1)), rep("Linear: Clin and PCA Coords", length(pca_lbv1)), rep("RF: Clin", length(rfcv1)), rep("RF: Clin and PCA Coords", length(pca_rfbv1))), "error" = c(lcv1, pca_lbv1, rfcv1, pca_rfbv1) ) pca_vol_data$model <- factor(pca_vol_data$model, levels=c("Linear: Clin", "Linear: Clin and PCA Coords", "RF: Clin", "RF: Clin and PCA Coords")) compare_means(error~model, data = pca_vol_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and PCA Coords"), c("RF: Clin", "RF: Clin and PCA Coords")) ggplot(pca_vol_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.0055, tip.length = 0.001) + ylim(0, 0.006)+ ggtitle("MSE for Models Predicting EGFR Volatility")
Load coordinate data
tsne_coords <- read.csv("../data/tsne_train_coords.csv") tsne_coords_t1 <- merge(clin[clin$group == 0,], tsne_coords, by = "patients")[-1] tsne_coords_t2 <- merge(clin[clin$group == 1,], tsne_coords, by = "patients")[-1] tsne_coords_t3 <- merge(clin[clin$group == 2,], tsne_coords, by = "patients")[-1]
Linear Models Predicting Rate
model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t1, tsne_coords_t2)) tsne_lbr1 <- linear_model(model, rbind(tsne_coords_t1,tsne_coords_t2), tsne_coords_t3, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t2, tsne_coords_t3)) tsne_lbr2 <- linear_model(model, rbind(tsne_coords_t2,tsne_coords_t3), tsne_coords_t1, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t1, tsne_coords_t3)) tsne_lbr3 <- linear_model(model, rbind(tsne_coords_t1,tsne_coords_t3), tsne_coords_t2, predict = "rate")
Predicting Volatility
model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t1, tsne_coords_t2)) tsne_lbv1 <- linear_model(model, rbind(tsne_coords_t1,tsne_coords_t2), tsne_coords_t3, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t2, tsne_coords_t3)) tsne_lbv2 <- linear_model(model, rbind(tsne_coords_t2,tsne_coords_t3), tsne_coords_t1, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(tsne_coords_t1, tsne_coords_t3)) tsne_lbv3 <- linear_model(model, rbind(tsne_coords_t1,tsne_coords_t3), tsne_coords_t2, predict = "volatility")
Random Forest Predicting Rate
rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t1,tsne_coords_t2), importance = TRUE, na.action = "na.exclude") tsne_rfbr1 <- rf_model(rf, rbind(tsne_coords_t1,tsne_coords_t2), tsne_coords_t3, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t2,tsne_coords_t3), importance = TRUE, na.action = "na.exclude") tsne_rfbr2 <- rf_model(rf, rbind(tsne_coords_t2,tsne_coords_t3), tsne_coords_t1, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t1,tsne_coords_t3), importance = TRUE, na.action = "na.exclude") tsne_rfbr3 <- rf_model(rf, rbind(tsne_coords_t1,tsne_coords_t3), tsne_coords_t2, predict = "rate")
Predicting Volatility
rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t1,tsne_coords_t2), importance = TRUE, na.action = "na.exclude") tsne_rfbv1 <- rf_model(rf, rbind(tsne_coords_t1,tsne_coords_t2), tsne_coords_t3, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t2,tsne_coords_t3), importance = TRUE, na.action = "na.exclude") tsne_rfbv2 <- rf_model(rf, rbind(tsne_coords_t2,tsne_coords_t3), tsne_coords_t1, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(tsne_coords_t1,tsne_coords_t3), importance = TRUE, na.action = "na.exclude") tsne_rfbv3 <- rf_model(rf, rbind(tsne_coords_t1,tsne_coords_t3), tsne_coords_t2, predict = "volatility")
Plots comparing models
tsne_rate_data <- data.frame("model" = c(rep("Linear: Clin", length(lcr1)), rep("Linear: Clin and TSNE Coords", length(tsne_lbr1)), rep("RF: Clin", length(rfcr1)), rep("RF: Clin and TSNE Coords", length(tsne_rfbr1))), "error" = c(lcr1, tsne_lbr1, rfcr1, tsne_rfbr1) ) tsne_rate_data$model <- factor(tsne_rate_data$model, levels=c("Linear: Clin", "Linear: Clin and TSNE Coords", "RF: Clin", "RF: Clin and TSNE Coords")) compare_means(error~model, data = tsne_rate_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and TSNE Coords"), c("RF: Clin", "RF: Clin and TSNE Coords")) ggplot(tsne_rate_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.017, tip.length = 0.001) + ylim(0, 0.018)+ ggtitle("MSE for Models Predicting EGFR Rate of Decline") tsne_vol_data <- data.frame("model" = c(rep("Linear: Clin", length(lcv1)), rep("Linear: Clin and TSNE Coords", length(tsne_lbv1)), rep("RF: Clin", length(rfcv1)), rep("RF: Clin and TSNE Coords", length(tsne_rfbv1))), "error" = c(lcv1, tsne_lbv1, rfcv1, tsne_rfbv1) ) tsne_vol_data$model <- factor(tsne_vol_data$model, levels=c("Linear: Clin", "Linear: Clin and TSNE Coords", "RF: Clin", "RF: Clin and TSNE Coords")) compare_means(error~model, data = tsne_vol_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and TSNE Coords"), c("RF: Clin", "RF: Clin and TSNE Coords")) ggplot(tsne_vol_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.0055, tip.length = 0.001) + ylim(0, 0.006)+ ggtitle("MSE for Models Predicting EGFR Volatility")
Load coordinate data
som_coords <- read.csv("../data/som_train_coords.csv") som_coords_t1 <- merge(clin[clin$group == 0,], som_coords, by = "patients")[-1] som_coords_t2 <- merge(clin[clin$group == 1,], som_coords, by = "patients")[-1] som_coords_t3 <- merge(clin[clin$group == 2,], som_coords, by = "patients")[-1]
Linear Models Predicting Rate
model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t1, som_coords_t2)) som_lbr1 <- linear_model(model, rbind(som_coords_t1,som_coords_t2), som_coords_t3, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t2, som_coords_t3)) som_lbr2 <- linear_model(model, rbind(som_coords_t2,som_coords_t3), som_coords_t1, predict = "rate") model <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t1, som_coords_t3)) som_lbr3 <- linear_model(model, rbind(som_coords_t1,som_coords_t3), som_coords_t2, predict = "rate")
Predicting Volatility
model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t1, som_coords_t2)) som_lbv1 <- linear_model(model, rbind(som_coords_t1,som_coords_t2), som_coords_t3, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t2, som_coords_t3)) som_lbv2 <- linear_model(model, rbind(som_coords_t2,som_coords_t3), som_coords_t1, predict = "volatility") model <- lm(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group - group, data = rbind(som_coords_t1, som_coords_t3)) som_lbv3 <- linear_model(model, rbind(som_coords_t1,som_coords_t3), som_coords_t2, predict = "volatility")
Random Forest Predicting Rate
rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t1,som_coords_t2), importance = TRUE, na.action = "na.exclude") som_rfbr1 <- rf_model(rf, rbind(som_coords_t1,som_coords_t2), som_coords_t3, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t2,som_coords_t3), importance = TRUE, na.action = "na.exclude") som_rfbr2 <- rf_model(rf, rbind(som_coords_t2,som_coords_t3), som_coords_t1, predict = "rate") rf <- randomForest(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t1,som_coords_t3), importance = TRUE, na.action = "na.exclude") som_rfbr3 <- rf_model(rf, rbind(som_coords_t1,som_coords_t3), som_coords_t2, predict = "rate")
Predicting Volatility
rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t1,som_coords_t2), importance = TRUE, na.action = "na.exclude") som_rfbv1 <- rf_model(rf, rbind(som_coords_t1,som_coords_t2), som_coords_t3, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t2,som_coords_t3), importance = TRUE, na.action = "na.exclude") som_rfbv2 <- rf_model(rf, rbind(som_coords_t2,som_coords_t3), som_coords_t1, predict = "volatility") rf <- randomForest(rawdata.sigma_egfr ~ . - rawdata.mu_egfr - rawdata.r_group - rawdata.v_group -group, data = rbind(som_coords_t1,som_coords_t3), importance = TRUE, na.action = "na.exclude") som_rfbv3 <- rf_model(rf, rbind(som_coords_t1,som_coords_t3), som_coords_t2, predict = "volatility")
Plots comparing models
som_rate_data <- data.frame("model" = c(rep("Linear: Clin", length(lcr1)), rep("Linear: Clin and SOM Coords", length(som_lbr1)), rep("RF: Clin", length(rfcr1)), rep("RF: Clin and SOM Coords", length(som_rfbr1))), "error" = c(lcr1, som_lbr1, rfcr1, som_rfbr1) ) som_rate_data$model <- factor(som_rate_data$model, levels=c("Linear: Clin", "Linear: Clin and SOM Coords", "RF: Clin", "RF: Clin and SOM Coords")) compare_means(error~model, data = som_rate_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and SOM Coords"), c("RF: Clin", "RF: Clin and SOM Coords")) ggplot(som_rate_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.017, tip.length = 0.001) + ylim(0, 0.018)+ ggtitle("MSE for Models Predicting EGFR Rate of Decline") som_vol_data <- data.frame("model" = c(rep("Linear: Clin", length(lcv1)), rep("Linear: Clin and SOM Coords", length(som_lbv1)), rep("RF: Clin", length(rfcv1)), rep("RF: Clin and SOM Coords", length(som_rfbv1))), "error" = c(lcv1, som_lbv1, rfcv1, som_rfbv1) ) som_vol_data$model <- factor(som_vol_data$model, levels=c("Linear: Clin", "Linear: Clin and SOM Coords", "RF: Clin", "RF: Clin and SOM Coords")) compare_means(error~model, data = som_vol_data) my_comparisons <- list(c("Linear: Clin", "Linear: Clin and SOM Coords"), c("RF: Clin", "RF: Clin and SOM Coords")) ggplot(som_vol_data, aes(x = model, y = error))+ geom_boxplot(outlier.shape = NA) + stat_compare_means(comparisons = my_comparisons, method = "t.test", label.y = 0.0055, tip.length = 0.001) + ylim(0, 0.006)+ ggtitle("MSE for Models Predicting EGFR Volatility")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.