#' Prepare TCGA clinical and expression data for Kaplan-Meier plotting
#'
#' Picks one survival time column and one survival status column from the
#' clinical table, drops samples whose time is missing, attaches the
#' expression of a single gene (matched by row name), and labels each sample
#' "high" or "low" relative to the median expression.
#'
#' Side effects: the processed table is saved as object `dat2` in
#' "../data/kmplotdata.Rdata"; a box plot of expression by status is drawn;
#' in an interactive session the user is additionally asked whether the
#' table should be written to a CSV file in the working directory.
#'
#' @param clinicaldata A data frame of clinical annotations whose row names
#'   identify the samples. It can be obtained from the results of dataexplore
#'   (load("survival_inputdata.Rdata")).
#' @param exprSet A data frame with exactly one column (the gene of interest)
#'   whose row names identify the samples. It can be obtained from the results
#'   of dataexplore (load("survival_inputdata.Rdata")).
#' @param x_axis Name of the survival time column (one of the `*_MONTHS`
#'   columns for DFS, DSS, OS or PFS). If `NULL`, it is chosen interactively.
#' @param y_axis Name of the survival status column (one of the `*_STATUS`
#'   columns for DFS, DSS, OS or PFS). If `NULL`, it is chosen interactively.
#'   It must differ from `x_axis`.
#' @return The plot returned by `ggstatsplot::ggbetweenstats()` comparing gene
#'   expression between the "high" and "low" groups. The cleaned data are not
#'   returned; they are saved to "../data/kmplotdata.Rdata".
#' @export
#' @import ggplot2 stringr ggpubr ggstatsplot
#' @author Wei Zhou <247328181@@qq.com>
#' @examples
#' \dontrun{
#' ## `clinical` and `expr` stand for the two tables produced by dataexplore
#' ## Choose the time and status columns interactively
#' dataprocess(clinicaldata = clinical, exprSet = expr)
#' ## Choose the time and status columns explicitly
#' dataprocess(clinical, expr, x_axis = "OS_MONTHS", y_axis = "OS_STATUS")
#' }
dataprocess <- function(clinicaldata = NULL, exprSet = NULL, x_axis = NULL, y_axis = NULL) {

  ## 1. validate the inputs
  if (is.null(clinicaldata) || is.null(exprSet)) {
    stop(
      "Both `clinicaldata` and `exprSet` must be supplied; ",
      "you can load(\"survival_inputdata.Rdata\") to get both datasets.",
      call. = FALSE
    )
  }
  if (!is.data.frame(clinicaldata)) {
    stop("`clinicaldata` must be a data frame.", call. = FALSE)
  }
  if (!is.data.frame(exprSet) || ncol(exprSet) != 1L) {
    stop(
      "`exprSet` must be a data frame with exactly one gene column.",
      call. = FALSE
    )
  }
  message("Please guarantee your two files are choose_clinicaldata and exprSet, respectively!")
  message("***Notation: Or you will load the survival_inputdata.Rdata on your own!")
  geneName <- names(exprSet)

  ## 2. list the survival columns available in the clinical data
  choose_columns <- .dataprocess_survival_columns(colnames(clinicaldata))
  if (length(choose_columns) < 2L) {
    stop(
      "The clinical dataset needs at least two survival columns ",
      "(a *_MONTHS time column and a *_STATUS status column).",
      call. = FALSE
    )
  }
  message(
    "This clinical dataset only has the following data: \n",
    "***Notation: x_axis presents time, y_axis presents status; ",
    "please choose the corresponding data!"
  )
  message(paste0(seq_along(choose_columns), ": ", choose_columns, collapse = "\n"))

  ## 3. choose x_axis (time) and y_axis (status)
  if (is.null(x_axis)) {
    x_axis <- .dataprocess_prompt_column(
      "Please input the x_axis number (time): ", choose_columns, arg = "x_axis"
    )
  } else {
    .dataprocess_check_column(x_axis, choose_columns, arg = "x_axis")
  }
  if (is.null(y_axis)) {
    y_axis <- .dataprocess_prompt_column(
      "Please input the y_axis number (status): ", choose_columns,
      arg = "y_axis", forbidden = x_axis
    )
  } else {
    .dataprocess_check_column(y_axis, choose_columns, arg = "y_axis")
    if (x_axis == y_axis) {
      stop("x_axis can't be the same with y_axis!", call. = FALSE)
    }
  }

  ## 4. filter the data: keep samples with a known time, attach expression
  choose_clinicaldata <- clinicaldata[, c(x_axis, y_axis), drop = FALSE]
  has_time <- !is.na(choose_clinicaldata[[x_axis]])
  dat2 <- choose_clinicaldata[has_time, , drop = FALSE]
  if (nrow(dat2) == 0L) {
    stop("No sample has a non-missing value in ", x_axis, ".", call. = FALSE)
  }
  # match() is used instead of character row indexing because `[.data.frame`
  # partially matches row names, which could pair a sample with the wrong row.
  expression <- exprSet[[1L]][match(rownames(dat2), rownames(exprSet))]
  if (all(is.na(expression))) {
    stop(
      "No sample of `clinicaldata` has an expression value in `exprSet`; ",
      "check that both tables use the same row names.",
      call. = FALSE
    )
  }
  dat2[[geneName]] <- expression

  ## 5. optionally save the filtered data (only possible when a user can answer)
  if (interactive() &&
      .dataprocess_ask_yes_no("Save the clinical data and related exprset data[y/n]: ")) {
    filename <- paste0(geneName, "_clinical_expr_data_", Sys.Date(), ".csv")
    utils::write.csv(dat2, file = filename)
    message("Save successfully!")
  }

  ## 6. expression by status; print() is required to draw inside a function
  p <- ggpubr::ggboxplot(
    dat2, x = y_axis, y = geneName, color = y_axis,
    palette = "jco", add = "jitter"
  )
  print(p + ggpubr::stat_compare_means(method = "t.test"))

  ## 7. split the samples at the median expression
  cutoff <- stats::median(dat2[[geneName]], na.rm = TRUE)
  dat2$group <- ifelse(dat2[[geneName]] > cutoff, "high", "low")

  ## 8. save the data under the object name `dat2`
  save_path <- "../data/kmplotdata.Rdata"
  if (!dir.exists(dirname(save_path))) {
    dir.create(dirname(save_path), recursive = TRUE)
  }
  save(dat2, file = save_path)

  ## 9. plot expression by group
  # ggbetweenstats() takes bare column names, so the gene column is copied to
  # a fixed name in a plotting copy; the saved `dat2` is left unchanged.
  plot_data <- dat2
  plot_data$geneName <- plot_data[[geneName]]
  ggstatsplot::ggbetweenstats(
    data = plot_data, x = group, y = geneName,
    xlab = "Patient group", ylab = paste0(geneName, "_expression")
  )
}

# Return the column names that contain a survival endpoint label, ordered as
# DFS, DSS, OS, PFS with the *_MONTHS column before the *_STATUS column.
.dataprocess_survival_columns <- function(column_names) {
  endpoints <- c("DFS", "DSS", "OS", "PFS")
  patterns <- paste0(rep(endpoints, each = 2L), c("_MONTHS", "_STATUS"))
  matches <- lapply(patterns, function(pattern) {
    column_names[grepl(pattern, column_names, fixed = TRUE)]
  })
  unlist(matches, use.names = FALSE)
}

# Stop unless `value` is a single column name contained in `choices`.
.dataprocess_check_column <- function(value, choices, arg) {
  if (!is.character(value) || length(value) != 1L || is.na(value)) {
    stop("`", arg, "` must be a single column name.", call. = FALSE)
  }
  if (!value %in% choices) {
    stop(
      "The input of ", arg, " is not in the clinical dataset! ",
      "Input again! Or you can eliminate ", arg, " and follow the tips.",
      call. = FALSE
    )
  }
  invisible(value)
}

# Ask for the number of one of `choices` until a valid one (different from
# `forbidden`, if given) is typed, and return the chosen column name.
.dataprocess_prompt_column <- function(prompt, choices, arg, forbidden = NULL) {
  if (!interactive()) {
    # readline() returns "" at once in a batch session, so prompting would
    # loop forever.
    stop(
      "`", arg, "` must be supplied when R is not running interactively.",
      call. = FALSE
    )
  }
  repeat {
    index <- suppressWarnings(as.integer(trimws(readline(prompt))))
    if (is.na(index) || index < 1L || index > length(choices)) {
      message("Please type right number format!")
    } else if (!is.null(forbidden) && choices[index] == forbidden) {
      message("y_axis can't be the same with x_axis!")
    } else {
      return(choices[index])
    }
  }
}

# Ask a yes/no question until "y"/"yes" or "n"/"no" is typed; TRUE means yes.
.dataprocess_ask_yes_no <- function(prompt) {
  repeat {
    answer <- trimws(tolower(readline(prompt)))
    if (answer %in% c("y", "yes")) {
      return(TRUE)
    }
    if (answer %in% c("n", "no")) {
      return(FALSE)
    }
    message("Wrong format! Please type y or n!")
  }
}


#       Musician: Resonance  #
#           Date: 2020/03/16 #
# Revised author: Resonance  #
#       1st Time: 2020/03/17 #
#       2nd Time: 2020/05/27 #
#       3rd Time: 2020/06/03 #
