R/ranger_reg_plot.R

Defines functions boxplot_rel_predicted_train_vs_test plot_rel_predicted calc_rel_predicted plot_reg_feature_selection plot_test_perf_VS_rand plot_train_vs_test plot_perf_VS_rand plot_residuals plot_obs_VS_pred

Documented in boxplot_rel_predicted_train_vs_test calc_rel_predicted plot_obs_VS_pred plot_perf_VS_rand plot_reg_feature_selection plot_rel_predicted plot_residuals plot_test_perf_VS_rand plot_train_vs_test

#' @importFrom stats cor wilcox.test residuals smooth.spline fitted median
#' @importFrom plyr ldply
#' @importFrom rlang .data
#' @import ggplot2
#' @importFrom gridExtra arrangeGrob
#' @importFrom reshape2 melt
#' @importFrom utils write.table
#' @importFrom gtools rdirichlet


#' @title plot_obs_VS_pred
#' @description Plot a scatterplot of observed and predicted values from a ranger model.
#' @param y The numeric values for labeling data.
#' @param predicted_y The predicted values for y.
#' @param metric A regression performance metric (i.e., MAE, RMSE, MSE, R_squared, Adj_R_squared, or Separman_rho) showed in the scatter plot.
#' @param SampleIDs The sample ids in the data table.
#' @param prefix The prefix for the dataset in the training or testing.
#' @param target_field A string indicating the target field of metadata for regression.
#' @param span Controls the amount of smoothing for the default loess smoother in the scatterplot.
#' Smaller numbers produce wigglier lines, larger numbers produce smoother lines.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' y<- 1:60
#' rf_model<-rf.out.of.bag(x, y)
#' plot_obs_VS_pred(y, predicted_y=rf_model$predicted, prefix="train", target_field="age", outdir=NULL)
#' @author Shi Huang
#' @export
plot_obs_VS_pred <- function(y, predicted_y, SampleIDs=NULL, prefix="train", target_field="value", metric="MAE", span=1, outdir=NULL){
  df<-data.frame(y, predicted_y)
  if(nrow(df)==length(SampleIDs)){
    df<-data.frame(SampleIDs, df)
  }else if(length(SampleIDs)>0 & nrow(df)!=length(SampleIDs)){
    stop("Please make sure that sample IDs match with y or predicted y.")
  }
  perf_value<-get.reg.performance(predicted_y, y)[[metric]]
  label = paste(metric, ": ", as.character(round(perf_value, 2)), sep="")
  p<-ggplot(df, aes(x=.data$y, y=.data$predicted_y))+
    ylab(paste("Predicted ",target_field,sep=""))+
    xlab(paste("Observed ",target_field,sep=""))+
    geom_point(alpha=0.1)+
    geom_smooth(method="loess",span=span)+
    annotate(geom="text", x=Inf, y=Inf, label=label, color="grey60", vjust=2, hjust=2)+
    theme_bw()
  #coord_flip()+
  if(!is.null(outdir)){
    ggsave(filename=paste(outdir, prefix, ".", target_field, ".obs_vs_pred.scatterplot.pdf",sep=""), plot=p, height=4, width=4)
    sink(paste(outdir, prefix, ".", target_field, ".obs_vs_pred.results.xls",sep=""));cat("\t")
    write.table(df, quote=FALSE,sep="\t", row.names = FALSE);sink(NULL)
  }
  p
}
#' @title plot_residuals
#' @description Plot the residuals of observed and predicted values from a ranger model.
#' @param y The numeric values for labeling data.
#' @param predicted_y The predicted values for y.
#' @param SampleIDs The sample ids in the data table.
#' @param prefix The prefix for the dataset in the training or testing.
#' @param target_field A string indicating the target field of the metadata for regression.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' y<- 1:60
#' rf_model<-rf.out.of.bag(x, y)
#' plot_residuals(y, predicted_y=rf_model$predicted, prefix="train", target_field="age")
#' @author Shi Huang
#' @export
plot_residuals <- function(y, predicted_y, SampleIDs=NULL, prefix="train", target_field="value", outdir=NULL){
  df<-data.frame(y, predicted_y, rsdl=y-predicted_y)
  if(nrow(df)==length(SampleIDs)){
    df<-data.frame(SampleIDs, df)
  }else if(length(SampleIDs)>0 & nrow(df)!=length(SampleIDs)){
    stop("Please make sure that sample IDs match with y or predicted y.")
  }
  p<-ggplot(df, aes(x=.data$y, y=.data$rsdl))+
    ylab(paste("Residuals of prediceted ",target_field,sep=""))+
    xlab(paste("Observed ",target_field,sep=""))+
    geom_point(alpha=0.1)+
    geom_hline(yintercept=0)+
    #geom_smooth(method="loess",span=span)+
    theme_bw()
  if(!is.null(outdir)){
  ggsave(filename=paste(outdir, prefix, ".", target_field, ".obs_vs_residuals_of_pred.scatterplot.pdf",sep=""), plot=p, height=4, width=4)
  sink(paste(outdir, prefix, ".", target_field, ".obs_vs_pred.results.xls",sep=""))
  cat("\t")
  write.table(df, quote=FALSE,sep="\t", row.names = FALSE);sink(NULL)
  }
  invisible(p)
}

#' @title plot_perf_VS_rand
#' @description This outputs a histogram and a p-value showing if the performance of a real regression model
#' significantly better than null models.
#' @param x The train data.
#' @param y The numeric labeling data.
#' @param nfolds The number of folds in the cross validation. If nfolds > length(y)
#' or nfolds==-1, uses leave-one-out cross-validation. If nfolds was a factor, it means customized folds
#' (e.g., leave-one-group-out cv) were set for CV.
#' @param predicted_y The predicted values for y.
#' @param n_features The number of features in the training data.
#' @param prefix The prefix for the dataset in the training or testing.
#' @param target_field A string indicating the target field of the metadata for regression.
#' @param metric The regression performance metric applied, including MAE, RMSE, MSE, R_squared, Adj_R_squared, or Separman_rho.
#' @param permutation The permutation times for a random guess of regression performance.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' y<- 1:60
#' rf_model<-rf.out.of.bag(x, y)
#' p<-plot_perf_VS_rand(x=x, y=y, predicted_y=rf_model$predicted, prefix="train", nfolds=5,
#' permutation=100, metric="MAE", target_field="age", n_features=5)
#' p
#' @author Shi Huang
#' @export
plot_perf_VS_rand<-function(x, y, predicted_y, prefix="train", target_field, nfolds,
                            metric="MAE", permutation=100, n_features=NA, outdir=NULL){
  # rand_perf <- function(y, permutation.=permutation, metric.=metric,
  #                       n_features.=n_features){
  #   set.seed(123)
  #   rand_y_mat <-replicate(permutation, sample(y, replace = FALSE))
  #   rand_perf_values <- apply(rand_y_mat, 2, function(x) get.reg.performance(x, y, n_features)[[metric]])
  #   rand_perf_values
  # }
  # rand_perf_values=rand_perf(y)
  shuffle_y_perf <- function(x, y, permutation.=permutation, metric.=metric, nfolds.=nfolds,
                        n_features.=n_features){
    set.seed(123)
    rand_y_mat <-replicate(permutation, sample(y, replace = FALSE))
    rand_perf_values <- apply(rand_y_mat, 2, function(rand_y){
      rand_rf <- rf.cross.validation(x, rand_y, nfolds=nfolds)
      perf <- get.reg.performance(rand_y, rand_rf$predicted, n_features)[[metric]]
    })
    rand_perf_values
  }
  emp_p_value <- function(perf_value, rand_perf_values){
    k<-length(rand_perf_values)
    p<-(sum(abs(perf_value >= rand_perf_values))+1)/(k+1)
    p
  }
  rand_perf_values <- shuffle_y_perf(x, y, permutation = 100)
  perf_value<-get.reg.performance(predicted_y, y, n_features)[[metric]]
  perf_values<-data.frame(perf_value, rand_perf_values)
  emp_p_value<-emp_p_value(perf_value, rand_perf_values)
  # histogram
  label = paste(metric, ": ", as.character(round(perf_value, 2)), "\np-value = ", round(emp_p_value, 2), sep="")
  p<-ggplot(perf_values, aes(x=.data$rand_perf_values)) + geom_histogram(alpha=0.5) +
    xlab(metric)+
    ylab("count")+
    geom_vline(data=perf_values, aes(xintercept = .data$perf_value)) +
    annotate(geom="text", x=perf_value, y=Inf, label=label, color="red", vjust=2, hjust=0)+theme_bw()
  if(!is.null(outdir)){
    ggsave(filename=paste(outdir, prefix, ".", target_field, ".", metric,
                          "_vs_rand.histogram.pdf",sep=""), plot=p, height=4, width=4)
  }
  res <- list()
  res$emp_p_value <- emp_p_value
  res$perf_values <- perf_values
  res$plot <- p
  res
}

#' @title plot_train_vs_test
#' @description Plot the residuals of observed and predicted values from a ranger model.
#' @param train_y The numeric labels for training data.
#' @param predicted_train_y The predicted values for training data.
#' @param train_SampleIDs The sample ids in the train data.
#' @param test_y The numeric labels for testing data.
#' @param predicted_test_y The predicted values for test data.
#' @param test_SampleIDs The sample ids in the test data.
#' @param train_prefix The prefix for the dataset in the training data.
#' @param test_prefix The prefix for the dataset in the testing data.
#' @param train_target_field A string indicating the target field of the training metadata for regression.
#' @param test_target_field A string indicating the target field of the testing metadata for regression.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' y<- 1:60
#' rf_model<-rf.out.of.bag(x, y)
#' plot_perf_VS_rand(y=y, predicted_y=rf_model$predicted, prefix="train",
#' permutation=100, metric="MAE", target_field="age")
#' @author Shi Huang
#' @export
plot_train_vs_test<-function(train_y, predicted_train_y, test_y, predicted_test_y, train_SampleIDs=NULL, test_SampleIDs=NULL,
                             train_prefix="train", test_prefix="test", train_target_field, test_target_field, outdir=NULL){
  train.pred<-data.frame(value=train_y,predicted_value=predicted_train_y)
  if(nrow(train.pred)==length(train_SampleIDs)){
    train.pred<-data.frame(SampleIDs=train_SampleIDs, train.pred)
  }else if(length(train_SampleIDs)>0 & nrow(train.pred)!=length(train_SampleIDs)){
    stop("Please make sure that sample IDs match with y or predicted y in the train data.")
  }
  test.pred<-data.frame(value=test_y,predicted_value=predicted_test_y)
  if(nrow(test.pred)==length(test_SampleIDs)){
    test.pred<-data.frame(SampleIDs=test_SampleIDs, test.pred)
  }else if(length(test_SampleIDs)>0 & nrow(test.pred)!=length(test_SampleIDs)){
    stop("Please make sure that sample IDs match with y or predicted y in the test data.")
  }
  data_name<-c(rep(train_prefix,nrow(train.pred)),rep(test_prefix,nrow(test.pred)))
  pred<-data.frame(data=data_name,rbind(train.pred,test.pred))
  pred$data<-factor(pred$data,levels=c(train_prefix,test_prefix),ordered=TRUE)
  l<-levels(pred$data); l_sorted<-sort(levels(pred$data))
  #Mycolor <- c("#D55E00", "#0072B2")
  #if(identical(order(l), order(l_sorted))){Mycolor=Mycolor }else{Mycolor=rev(Mycolor)}
  p<-ggplot(pred,aes(x=.data$value,y=.data$predicted_value))+
    ylab(paste("Predicted ",train_target_field,sep=""))+
    xlab(paste("Observed ",train_target_field,sep=""))+
    geom_point(aes(color=.data$data), alpha=0.1)+
    geom_smooth(aes(color=.data$data), method="loess",span=1)+
    #scale_color_manual(values = Mycolor)+
    theme_bw() +
    facet_wrap(~.data$data)+
    theme(legend.position="none")
  #coord_flip()+
  if(!is.null(outdir)){
  ggsave(filename=paste(outdir, train_prefix,"-",test_prefix,".",test_target_field, ".train_test_ggplot.pdf",sep=""),plot=p, height=3, width=6)
  sink(paste(outdir, train_prefix,"-",test_prefix,".",test_target_field, ".train_test_results.xls",sep=""));
  cat("\t");write.table(pred, quote=FALSE,sep="\t", row.names = FALSE);sink(NULL)
  }
  invisible(p)
}


#' @title plot_test_perf_VS_rand
#' @description This outputs a histogram and a p-value showing if the performance of a real regression model
#' significantly better than null models.
#' @param rf_model A trained rf model object, which should be generated from /code{rf.cross.validation} or /code{rf.out.of.bag}.
#' @param newy The data label of new data.
#' @param newx A data.matrix or data.frame with the new data for rf model testing.
#' @param n_features The number of features in the training data.
#' @param prefix The prefix for the dataset in the training or testing.
#' @param target_field A string indicating the target field of the metadata for machine-learning analysis.
#' @param metric The regression performance metric applied, including MAE, RMSE, MSE, R_squared, Adj_R_squared, or Separman_rho.
#' @param permutation The permutation times for a random guess of regression performance.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' y<- 1:60
#'
#' newx <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' newy<- 4:33
#' rf_model<-rf.out.of.bag(x, y)
#' p<-plot_perf_VS_rand(x=x, y=y, predicted_y=rf_model$predicted, prefix="train", nfolds=5,
#' permutation=100, metric="MAE", target_field="age", n_features=5)
#' p
#' p_test<-plot_test_perf_VS_rand(rf_model, newx, newy, permutation=100,
#'                                metric="MAE", target_field="age", n_features=5)
#' p_test
#' @author Shi Huang
#' @export
plot_test_perf_VS_rand<-function(rf_model, newx, newy, prefix="test", target_field="",
                            metric="MAE", permutation=1000, n_features=NA, outdir=NULL){

  shuffle_y_test_perf <- function(rf_model, newx, newy, permutation.=permutation, metric.=metric,
                             n_features.=n_features){
    set.seed(123)
    rand_y_mat <-replicate(permutation, sample(newy, replace = FALSE))
    rand_perf_values <- apply(rand_y_mat, 2, function(rand_y){
      predicted_newy <- predict(rf_model$rf.model, newx, type="response")$predictions
      perf <- get.reg.performance(rand_y, predicted_newy)[[metric]]
    })
    rand_perf_values
  }
  emp_p_value <- function(perf_value, rand_perf_values){
    k<-length(rand_perf_values)
    p<-(sum(abs(perf_value >= rand_perf_values))+1)/(k+1)
    p
  }
  rand_perf_values <- shuffle_y_test_perf(rf_model, newx, newy)
  predicted_newy <- predict(rf_model$rf.model, newx, type="response")$predictions
  perf_value<-get.reg.performance(predicted_newy, newy)[[metric]]
  perf_values<-data.frame(perf_value, rand_perf_values)
  emp_p_value<-emp_p_value(perf_value, rand_perf_values)
  # histogram
  label = paste(metric, ": ", as.character(round(perf_value, 2)), "\np-value = ", round(emp_p_value, 2), sep="")
  p<-ggplot(perf_values, aes(x=.data$rand_perf_values)) + geom_histogram(alpha=0.5) +
    xlab(metric)+
    ylab("count")+
    geom_vline(data=perf_values, aes(xintercept = .data$perf_value)) +
    annotate(geom="text", x=perf_value, y=Inf, label=label, color="red", vjust=2, hjust=0)+theme_bw()
  if(!is.null(outdir)){
    ggsave(filename=paste(outdir, prefix, ".", target_field, ".", metric,
                          "_vs_rand.histogram.pdf",sep=""), plot=p, height=4, width=4)
  }
  res <- list()
  res$emp_p_value <- emp_p_value
  res$perf_values <- perf_values
  res$plot <- p
  res
}


#' @title plot_reg_feature_selection
#' @description Plot the regression performance against the reduced number of features used in the modeling.
#' @param x The data frame or data matrix for model training.
#' @param y The numeric values for labeling data.
#' @param nfolds The number of folds in the cross-validation for each feature set.
#' @param unit The unit of numeric metadata variables that can be printed in the output figure.
#' @param rf_reg_model The rf regression model from \code{rf.out.of.bag}
#' @param metric The regression performance metric applied.
#' This must be one of "MAE", "RMSE", "MSE", "MAPE".
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' require("gtools")
#' n_features <- 100
#' prob_vec <- rdirichlet(5, sample(n_features))
#' x <- data.frame(rbind(t(rmultinom(7, 7*n_features, prob_vec[1, ])),
#'             t(rmultinom(8, 8*n_features, prob_vec[2, ])),
#'             t(rmultinom(15, 15*n_features, prob_vec[3, ])),
#'             t(rmultinom(15, 15*n_features, prob_vec[4, ])),
#'             t(rmultinom(15, 15*n_features, prob_vec[5, ]))))
#' y<- 1:60
#' rf_reg_model<-rf.out.of.bag(x, y)
#' rf_reg_model<-rf.cross.validation(x, y)
#' fs_summ <- plot_reg_feature_selection(x, y, rf_reg_model, metric="MAE", outdir=NULL)
#' @author Shi Huang
#' @export
plot_reg_feature_selection <- function(x, y, rf_reg_model, nfolds=5, metric="MAE",
                                       unit=NA, outdir=NULL){
  if(class(rf_reg_model)=="rf.cross.validation"){
    rank_mat <- apply(rf_reg_model$importances, 2, function(x){rank(-x, na.last = "keep")})
    rf_imp_rank <- rank(apply(rank_mat, 1, median), na.last = "keep")
    }else if(class(rf_reg_model)=="rf.out.of.bag"){
      rf_imp_rank<-rank(-(rf_reg_model$importances), na.last = "keep")
    }else{
      stop("The class of input rf model should be rf.out.of.bag or rf.cross.validation.")
    }
  max_n<-max(rf_imp_rank, na.rm = TRUE)
  n_total_features<-ncol(x)
  n_features<-c(2, 4, 8, 16, 32, 64, 128, 256, 512, 1024)
  min_dist_idx <- which.min(abs(n_features-n_total_features))
  maginal_idx <- ifelse(n_features[min_dist_idx]>n_total_features, min_dist_idx-1, min_dist_idx)
  n_features<-n_features[1:maginal_idx]
  top_n_perf<-matrix(NA, ncol=6, nrow=length(n_features)+1)
  colnames(top_n_perf)<-c("n_features", "MSE", "RMSE", "MAE", "MAPE", "Spearman_rho")
  rownames(top_n_perf)<-top_n_perf[,1]<-c(n_features, max_n)
  top_n_rf_list <-vector(mode="list", length = length(n_features))
  names(top_n_rf_list) <- n_features
  for(i in 1:length(n_features)){
    idx<-which(rf_imp_rank<=n_features[i])
    #top_n_features<-names(rf_imp_rank[idx])
    x_n<-x[, idx]
    y_n<-y
    top_n_rf_list[[i]] <- top_n_rf<-rf.cross.validation(x_n, y_n, nfolds=5, ntree=500)
    top_n_rf_perf<-get.reg.performance(top_n_rf$predicted, y_n)
    top_n_perf[i, 1]<-n_features[i]
    top_n_perf[i, 2]<-top_n_rf_perf$MSE
    top_n_perf[i, 3]<-top_n_rf_perf$RMSE
    top_n_perf[i, 4]<-top_n_rf_perf$MAE
    top_n_perf[i, 5]<-top_n_rf_perf$MAPE
    top_n_perf[i, 6]<-top_n_rf_perf$Spearman_rho
  }

  all_rf_perf<-get.reg.performance(rf_reg_model$predicted, y)
  top_n_perf[length(n_features)+1, ]<-as.numeric(c(max_n, all_rf_perf[c("MSE", "RMSE", "MAE", "MAPE", "Spearman_rho")]))
  top_n_perf<-data.frame(top_n_perf)
  if(metric=="Spearman_rho"){
    best_idx <- which.max(top_n_perf[, metric])
    }else{
    best_idx <- which.min(top_n_perf[, metric])
    }
  best_n_features <- top_n_perf$n_features[best_idx]
  breaks<-top_n_perf$n_features
  y_label <- ifelse(is.na(unit), metric, paste(metric, " (", unit ,")", sep=""))
  p<-ggplot(top_n_perf, aes(x=.data$n_features, y=get(metric))) +
    xlab("# of features used")+
    ylab(y_label)+
    scale_x_continuous(trans = "log", breaks=breaks)+
    geom_point() + geom_line()+
    geom_vline(xintercept = best_n_features, linetype="dashed", color="blue")+
    geom_text(aes(x=best_n_features, y=top_n_perf[best_idx, metric], label=best_n_features),
              color="blue", vjust=-1)+
    theme_bw()+
    theme(axis.line = element_line(color="black"),
          axis.title = element_text(size=18),
          strip.background = element_rect(colour = "white"),
          panel.border = element_blank())
  if(!is.null(outdir)){
  ggsave(filename=paste(outdir,"rf__",metric,"__top_rankings.scatterplot.pdf",sep=""), plot=p, width=5, height=4)
  }
  res <- list()
  res$top_n_perf <- top_n_perf
  res$best_n_features <- best_n_features
  res$top_n_rf <-top_n_rf_list
  res
}

#' @title calc_rel_predicted
#' @description Calculate the relative predicted values to the spline fit.
#' @param train_y The numeric labels for training data.
#' @param predicted_train_y The predicted values for training data.
#' @param train_SampleIDs The sample ids in the train data.
#' @param test_y The numeric labels for testing data.
#' @param predicted_test_y The predicted values for test data.
#' @param test_SampleIDs The sample ids in the test data.
#' @param train_prefix The prefix for the dataset in the training data.
#' @param test_prefix The prefix for the dataset in the testing data.
#' @param train_target_field A string indicating the target field of the training metadata for regression.
#' @param test_target_field A string indicating the target field of the testing metadata for regression.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' train_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' train_y<- 1:60
#' test_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209)))))
#' test_y<- 1:45
#' train_rf_model<-rf.out.of.bag(train_x, train_y)
#' predicted_test_y<-predict(train_rf_model$rf.model, test_x)$predictions
#' calc_rel_predicted(train_y, predicted_train_y=train_rf_model$predicted)
#' calc_rel_predicted(train_y=train_y, predicted_train_y=train_rf_model$predicted,
#'                    #train_SampleIDs=as.character(1:60),
#'                    test_y=test_y, predicted_test_y=predicted_test_y,
#'                    #test_SampleIDs=as.character(1:45),
#'                    train_target_field="y",  test_target_field="test_y", outdir=NULL)
#' calc_rel_predicted(train_y=train_y, predicted_train_y=train_rf_model$predicted,
#'                    train_SampleIDs=as.character(1:60),
#'                    test_y=test_y, predicted_test_y=predicted_test_y,
#'                    test_SampleIDs=as.character(1:45),
#'                    train_target_field="y",  test_target_field="test_y", outdir=NULL)
#' @author Shi Huang
#' @export
calc_rel_predicted<-function(train_y, predicted_train_y, train_SampleIDs=NULL,
                             test_y=NULL, predicted_test_y=NULL, test_SampleIDs=NULL,
                             train_prefix="train", test_prefix="test",
                             train_target_field="y", test_target_field="y", outdir=NULL){
  spl_train <- smooth.spline(train_y, predicted_train_y)
  train_relTrain <- residuals(spl_train)
  train_fittedTrain <- fitted(spl_train)
  relTrain_data<-train_relTrain_data <- data.frame(y=train_y, predicted_y=predicted_train_y,
                                                   rel_predicted_y=train_relTrain,
                                                   fitted_predicted_y=train_fittedTrain)
  if(!is.null(train_SampleIDs)){
    if(nrow(relTrain_data)!=length(train_SampleIDs)){
      stop("Please make sure that sample IDs match with y or predicted y in the train data.")
    }else{
      relTrain_data<-train_relTrain_data<-data.frame(SampleIDs=train_SampleIDs, relTrain_data)
    }
  }
  if(!is.null(outdir)){
    sink(paste(outdir, train_prefix,".Relative_",train_target_field,".results.xls",sep=""))
    cat("\t")
    write.table(relTrain_data,quote=FALSE,sep="\t", row.names = FALSE);sink(NULL)
  }

  if(!is.null(test_y) & !is.null(predicted_test_y)){
    test_fitted<-predict(spl_train, test_y)$y
    test_relTrain <- predicted_test_y - test_fitted
    test_relTrain_data <- data.frame(y=test_y, predicted_y=predicted_test_y,
                                     rel_predicted_y=test_relTrain,
                                     fitted_predicted_y=test_fitted)
    if(!is.null(test_SampleIDs)){
      if(nrow(test_relTrain_data)!=length(test_SampleIDs)){
        stop("Please make sure that sample IDs match with y or predicted y in the test data.")
        }else{
          test_relTrain_data<-data.frame(SampleIDs=test_SampleIDs, test_relTrain_data)
        }
    }
    relTrain_data<-rbind(train_relTrain_data, test_relTrain_data)
    DataSet <- factor(c(rep(train_prefix,length(train_relTrain)), rep(test_prefix,length(test_relTrain)) ))
    relTrain_data<-data.frame(relTrain_data, DataSet)

    if(!is.null(outdir)){
      sink(paste(outdir, train_prefix,"-",test_prefix,".Relative_",train_target_field,".results.xls",sep=""));
      cat("\t");
      write.table(relTrain_data,quote=FALSE,sep="\t", row.names = FALSE);sink(NULL)
    }
  }
    return(relTrain_data)
}

#' @title plot_rel_predicted
#' @description Calculate the relative predicted values to the spline fit in the training data.
#' @param relTrain_data The output dataframe of \code{calc_rel_predicted}.
#' @param prefix The prefix of a train dataset.
#' @param target_field A string indicating the target field in the metadata for regression.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' train_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' train_y<- 1:60
#' test_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209)))))
#' test_y<- 1:45
#' train_rf_model<-rf.out.of.bag(train_x, train_y)
#' predicted_test_y<-predict(train_rf_model$rf.model, test_x)$predictions
#' relTrain_data<-calc_rel_predicted(train_y, train_rf_model$predicted, test_y, predicted_test_y)
#' plot_rel_predicted(relTrain_data)
#' @author Shi Huang
#' @export
plot_rel_predicted <- function(relTrain_data, prefix="train", target_field="value", outdir=NULL){
  if(length(prefix)==1){
    relTrain_data<-subset(relTrain_data, .data$DataSet==prefix)
    p<-ggplot(relTrain_data, aes(x=.data$y, y=.data$rel_predicted_y))+
      ylab(paste("Relative prediceted ",target_field,sep=""))+
      xlab(paste("Observed ",target_field,sep=""))+
      geom_point(alpha=0.1)+
      geom_hline(yintercept=0)+
      #geom_smooth(method="loess",span=span)+
      theme_bw()
    #coord_flip()+
  }else{
    p<-ggplot(relTrain_data, aes(x=.data$y, y=.data$rel_predicted_y, color=.data$DataSet))+
      ylab(paste("Relative prediceted ",target_field,sep=""))+
      xlab(paste("Observed ",target_field,sep=""))+
      geom_point(alpha=0.1)+
      geom_hline(yintercept=0)+
      #geom_smooth(method="loess",span=span)+
      theme_bw()
    prefix=paste(prefix, collapse = "-")
  }
  ggsave(filename=paste(outdir, prefix, ".", target_field, ".obs_vs_relative_pred.scatterplot.pdf",sep=""), plot=p, height=4, width=4)
  invisible(p)
}

#' @title boxplot_rel_predicted_train_vs_test
#' @description Make the boxplot the relative predicted values in both train and test datasets.
#' @param relTrain_data The output dataframe of \code{calc_rel_predicted}.
#' @param train_prefix The prefix for the dataset in the training data.
#' @param test_prefix The prefix for the dataset in the testing data.
#' @param train_target_field A string indicating the target field of the training metadata for regression.
#' @param outdir The output directory.
#' @examples
#' set.seed(123)
#' train_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209))),
#'             t(rmultinom(15, 75, c(.001,.1,.42,.18,.299)))))
#' train_y<- 1:60
#' test_x <- data.frame(rbind(t(rmultinom(7, 75, c(.201,.5,.02,.18,.099))),
#'             t(rmultinom(8, 75, c(.201,.4,.12,.18,.099))),
#'             t(rmultinom(15, 75, c(.011,.3,.22,.18,.289))),
#'             t(rmultinom(15, 75, c(.091,.2,.32,.18,.209)))))
#' test_y<- 1:45
#' train_rf_model<-rf.out.of.bag(train_x, train_y)
#' predicted_test_y<-predict(train_rf_model$rf.model, test_x)$predictions
#' relTrain_data<-calc_rel_predicted(train_y, predicted_train_y=train_rf_model$predicted,
#'                                   train_SampleIDs=NULL,
#'                                   test_y, predicted_test_y,
#'                                   test_SampleIDs=NULL)
#' boxplot_rel_predicted_train_vs_test(relTrain_data)
#' @author Shi Huang
#' @export
boxplot_rel_predicted_train_vs_test<-function(relTrain_data, train_target_field="value",
                                              train_prefix="train", test_prefix="test", outdir=NULL){
    NoTestDataset<-all(grepl("DataSet", colnames(relTrain_data))==FALSE)
    if(NoTestDataset) stop("Test dataset should be included for residuals comparison between train and test datasets!")
    p_w<-formatC(stats::wilcox.test(rel_predicted_y~DataSet, data = relTrain_data)$p.value,digits=4,format="g")
    # l<-levels(relTrain_data$DataSet); l_sorted<-sort(levels(relTrain_data$DataSet))
    # Mycolor <- rep(c("#D55E00", "#0072B2"), length.out=length(l))
    # if(identical(order(l), order(l_sorted))){
    #   Mycolor=Mycolor; l_ordered=l
    # }else{Mycolor=rev(Mycolor); l_ordered=l_sorted}
    p<-ggplot(relTrain_data, aes(x=.data$DataSet, y=.data$rel_predicted_y)) +
      geom_violin(aes(color=.data$DataSet))+
      geom_boxplot(outlier.shape = NA, width=0.4)+
      geom_jitter(position=position_jitter(width=0.2),alpha=0.1) + # aes(color=DataSet),
      geom_hline(yintercept=0)+
      theme_bw()+
      ggtitle(paste("Wilcoxon Rank Sum Test:\n P=", p_w, sep=""))+
      xlab("Data sets") +
      ylab(paste("Relative predicted", train_target_field))+
      theme(legend.position="none")
    # p<-p+scale_color_manual(values = Mycolor, labels=l_ordered)
    ggsave(filename= paste(outdir, train_prefix,"-",test_prefix, ".Relative_",train_target_field,".boxplot.pdf",sep=""), width=3, height=4)
    invisible(p)
}
shihuang047/crossRanger documentation built on Feb. 7, 2023, 10:03 p.m.