R/ABANDONED.R

# Legacy codes

if(F){

  # cvFitOneModel
  cvFitOneModel_legacy <- function(Xbin, Ybin,
                                   params=list(
                                     # xgb.cv only
                                     nfold=5,
                                     nrounds = 100,
                                     # xgb.cv & xgboost
                                     max_depth = 10,
                                     eta = 0.5,
                                     nthread = 5,
                                     colsample_bytree = 1,
                                     min_child_weight = 1
                                   ),
                                   breakVec=c(0, 0.25, 0.5, 0.75, 1.0),
                                   genes,
                                   verbose = F){


    # Test
    if(F){

      # Example
      library(GSClassifier); library(xgboost)
      testData <- readRDS(system.file("extdata", "testData.rds", package = "GSClassifier"))
      expr <- testData$PanSTAD_expr_part
      design <- testData$PanSTAD_phenotype_part
      modelInfo <- modelData(
        design,
        id.col = "ID",
        variable = c("platform", "PAD_subtype"),
        Prop = 0.1,
        seed = 145
      )
      Xs <- expr[,modelInfo$Data$Train$ID]
      y <- modelInfo$Data$Train
      y <- y[colnames(Xs),]
      Ys <- ifelse(y$PAD_subtype == 'PAD-I',1,ifelse(y$PAD_subtype == 'PAD-II',2,ifelse(y$PAD_subtype == 'PAD-III',3,ifelse(y$PAD_subtype == 'PAD-IV',4,NA)))); table(Ys)/length(Ys)
      PADi <- readRDS(system.file("extdata", paste0('PAD.train_20220916.rds'), package = "GSClassifier"))
      geneSet <- PADi$geneSet
      res <- trainDataProc(
        Xmat = Xs,
        Yvec = Ys,
        geneSet = geneSet,
        subtype = 2,
        ptail = 0.5,
        breakVec = c(0, 0.25, 0.5, 0.75, 1)
      )

      # Data
      Xbin <- res$dat$Xbin
      Ybin <- res$dat$Ybin
      genes <- res$dat$Genes
      params <- list(
        # xgboost & xgb.cv
        nfold = 5,
        nrounds = 100,

        # xgboost
        max_depth = 10,
        colsample_bytree = 1,
        min_child_weight = 1,
        eta = 0.5,
        gamma = 0.25,
        subsample = 0.7
      )
      breakVec=c(0, 0.25, 0.5, 0.75, 1.0)
      verbose = T

    }

    dtrain <- xgb.DMatrix(Xbin, label = Ybin)

    # xgb.cv

    # 2022-09-15  : WARNING: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

    # 2022-09-16: WARNING: If you are loading a serialized model (like pickle in Python, RDS in R) generated by
    # older XGBoost, please export the model by calling `Booster.save_model` from that version
    # first, then load it back in current version. See: https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
    # for more details about differences between saving model and serializing.

    for(i in 1:10000){
      x <- tryCatch(
        cvRes <- xgb.cv(
          params = params,
          nrounds = params$nrounds,
          nfold = params$nfold,
          data = dtrain,
          early_stopping_rounds=2,
          metrics = list("logloss", "auc"),
          objective = "binary:logistic",
          verbose = verbose
        ),
        error = function(e)e)
      if('message' %in% names(x)){
        if(verbose) LuckyVerbose('Attention! AUC: the dataset only contains pos or neg samples. Repeat xgb.cv')
        x_error <- x
      } else {
        cvRes <- x
        break
      }
    }

    if(verbose) LuckyVerbose('Best interation: ',cvRes$best_iteration)

    # xgboost via best interation
    bst <- xgboost(params = params,
                   data = Xbin,
                   label = Ybin,
                   nrounds = cvRes$best_iteration,
                   objective = "binary:logistic",
                   verbose = ifelse(verbose,1,0))

    return(list(bst=bst, breakVec=breakVec, genes=genes))
  }



  # nround不是xgb.cv选的。容易过拟合。
  cvFitOneModel <- function(Xbin, Ybin,genes,
                            params = list(
                              # xgb.cv only
                              nfold=5,
                              nrounds = 15,
                              # xgb.cv & xgboost
                              max_depth = 10,
                              eta = 0.5,
                              nthread = 5,
                              colsample_bytree = 1,
                              min_child_weight = 1
                            ),
                            breakVec=c(0, 0.25, 0.5, 0.75, 1.0),
                            seed = 102,
                            verbose = F){


    # Test
    if(F){

      # Example
      library(GSClassifier); library(xgboost)
      testData <- readRDS(system.file("extdata", "testData.rds", package = "GSClassifier"))
      expr <- testData$PanSTAD_expr_part
      design <- testData$PanSTAD_phenotype_part
      modelInfo <- modelData(
        design,
        id.col = "ID",
        variable = c("platform", "PAD_subtype"),
        Prop = 0.1,
        seed = 145
      )
      Xs <- expr[,modelInfo$Data$Train$ID]
      y <- modelInfo$Data$Train
      y <- y[colnames(Xs),]
      Ys <- ifelse(y$PAD_subtype == 'PAD-I',1,ifelse(y$PAD_subtype == 'PAD-II',2,ifelse(y$PAD_subtype == 'PAD-III',3,ifelse(y$PAD_subtype == 'PAD-IV',4,NA)))); table(Ys)/length(Ys)
      PADi <- readRDS(system.file("extdata", paste0('PAD.train_20220916.rds'), package = "GSClassifier"))
      geneSet <- PADi$geneSet
      res <- trainDataProc(
        Xmat = Xs,
        Yvec = Ys,
        geneSet = geneSet,
        subtype = 2,
        ptail = 0.5,
        breakVec = c(0, 0.25, 0.5, 0.75, 1)
      )

      # Data
      Xbin <- res$dat$Xbin
      Ybin <- res$dat$Ybin
      genes <- res$dat$Genes
      params <- list(
        # xgboost & xgb.cv
        nrounds = 15,

        # xgboost
        max_depth = 10,
        colsample_bytree = 1,
        min_child_weight = 1,
        eta = 0.5,
        gamma = 0.25,
        subsample = 0.7
      )
      breakVec=c(0, 0.25, 0.5, 0.75, 1.0)
      verbose = T
      seed = 102

    }

    # xgboost via best interation
    set.seed(seed)
    params_xg <- params[-match(c('nrounds'), names(params))]
    bst <- xgboost(params = params_xg,
                   data = Xbin,
                   label = Ybin,
                   objective = "binary:logistic",
                   nrounds = params$nrounds,
                   verbose = ifelse(verbose,1,0))
    return(list(bst=bst, breakVec=breakVec, genes=genes))
  }

}
huangwb8/GSClassifier documentation built on July 12, 2024, 5:10 p.m.