R/model_building.R

Defines functions predict_cl TrainModel ClassVarToFactor TrainTest TrainSetList BalancedSample

Documented in BalancedSample ClassVarToFactor predict_cl TrainModel TrainSetList TrainTest

#' Remove zeros
#'
#' This function removes events with 0 expression of mandatory makers (e.g. DNA 
#' intercalator) as well as cells with 0 expression in all optional markers 
#' (e.g. CD4, CD8, EpCAM, CD31, etc.). Mandatory markers are those which should 
#' be expressed for an event to be considered valid. Optional markers are those 
#' that allow cell type identification, therefore their absolute abscence make 
#' the event invalid because it cannot be identified as a specific cell type.
#'
#' @param df An object of class \code{data.frame}.
#' @param mand_inx A vector with the column indexes for the mandadory markers.
#' @param opt_idx A vector with the column indexes for the optional markers.
#'
#' @return Returns a \code{data.frame} with the invalid events already removed.
#' @export
rm_zeros <- function (df, mand_idx, opt_idx, ...){
        mand <- c()
        for (i in 1:length(mand_idx)){
            mand <- c(mand,(which(df[,mand_idx[i]] == 0)))
        }
        op <- c()
        for (i in 1:length(opt_idx)){
            op <- c(op,(which(df[,opt_idx[i]] == 0)))
        }
        # Select only the events == 0 for all 'optional' markers
        optional_combined <- as.data.frame(table(op))
        op <- as.numeric(subset(optional_combined, Freq == length(opt_idx), 
            select = op)$op)
        k <- unique(c(mand, op))
        if (length(k) == 0) {
            df <- df
        } else {
            df <- df[-k, ]
        }
        #cat('# of rows dropped: ', length(k), '\n')

    return(df)
}



#' Obtain a class-balanced sample
#'
#' This function samples the rows of a \code{data.frame} with balanced classes.
#' Useful when original training set has severely unbalanced classes.
#'
#' @param df An object of class \code{data.frame}.
#' @param sample_size Numeric. Size of the sample from each class.
#' @param class_col A character vector with the name of the column that identify 
#'        the classes.
#'
#' @return Returns a \code{data.frame} with n=\code{sample_size} rows per class
#'         randomly sampled.
#' @export 
BalancedSample <- function(df, sample_size = 5000, class_col = class_col){
    if(length(which(df[class_col] == 1))<sample_size){
        replace_1 <- TRUE
    }else{
        replace_1 <- FALSE
    }
    if(length(which(df[class_col] == 0))<sample_size){
        replace_0 <- TRUE
    }else{
        replace_O <- FALSE
    }
    idx <- c(sample(which(df[class_col] == 1), sample_size, replace = replace_1), #noise/beads
             sample(which(df[class_col] == 0), sample_size, replace = replace_0)) #cells
    df <- df[idx,]
    return(df)
}




#' Get a list of dataframes for training
#'
#' This function takes a list of \code{data.frame} and returns a random sample 
#' of the original list to be used for for training a model. The remaining 
#' \code{data.frame} should be used to test the model.
#' 
#' @param dt_ls A \code{list} of \code{data.frame}s
#' @param s_train Numeric. The size of the training set. Any value between (0:1)
#' 
#' @return A random subset of \code{data.frame}s as a \code{list}.
#' @export
TrainSetList <- function(dt_ls, s_train = 0.75) {
    train_size <- round(length(dt_ls)*s_train)
    dt_train <- sample(dt_ls, train_size)
    return(dt_train)
}



#' Get datasets for model training
#'
#' This function generates training and test sets for model training with 
#' balanced classes using functions \code{BalancedSample} and \code{TrainSetList}.
#' The training and test datasets as well as vectors that identify the file of 
#' origin of each row are saved as an RData object.
#'
#' @param dt_ls A \code{list} of \code{data.frame}s with balanced classes.
#' @param output_path A character vector with full path names.
#' @param label A character vector with the prefix of the RData file to be outputted.
#' @param class_col A character vector with the name of the column that identify 
#'        the classes.
#' @export
TrainTest <- function(dt_ls, output_path = output_path, label = label, 
    class_col = class_col, ...){
    dt_train <- TrainSetList(dt_ls, ...)
    k <- which(names(dt_ls)%in% names(dt_train))
    dt_test <- dt_ls[-k]

    class_size <- table(dt_ls[[1]][class_col])[1]

    # Save names of files used for training and test for reference
    train_nms <- rep(names(dt_train), each=class_size*2)
    test_nms <- rep(names(dt_test), each=class_size*2)

    train_set <- do.call("rbind", c(dt_train, make.row.names = FALSE))
    test_set <- do.call("rbind", c(dt_test, make.row.names = FALSE))
    test_train_l <- list('train_set' = train_set, 'test_set' = test_set)
    
    save(train_set, test_set, train_nms, test_nms, file = file.path(output_path, 
        paste0(label, '_TrainTest.RData')))
}



#' Converts the class labels from numeric to factor.
#'
#' A function to make the class column compatible with \code{caret} training function
#' for classification. Should be used when classes are labeled as numeric and
#' need to be transform to a factor.
#' 
#' @param df An object of class \code{data.frame}.
#' @param class_col A character vector with the name of the column that identify 
#'        the classes.
#' @param name_0 A character vector with the name for the class==0
#' @param name_1 A character vector with the name for the class==1
#'
#' @return Returns a \code{data.frame} with the original class labels converted
#'         into factors.
#' @export
ClassVarToFactor <- function(df, class_col = class_col, name_0 = name_0, 
  name_1 = name_1) {
  idx_0 <- which(df[class_col] == 0)
  idx_1 <- which(df[class_col] == 1)

  df[idx_0, class_col] <- name_0
  df[idx_1, class_col] <- name_1

  return(df)
}





#' Tunes and trains a classification model
#'
#' This function tunes and train a model (or models) for classification. Current
#' version supports Random Forest and XGBoost algorithms. Allows parallelization.
#' The final model, feature importance, prediction results of the test dataset 
#' and confusion matrix are outputted as an RData object.
#'
#' @param TrainSet A \code{data.frame} with balanced classes.
#' @param TestSet A \code{data.frame}.
#' @param alg A character vector with the name of the classification algorithm 
#'        to be used. Options are Random Forest 'RF', XGBoost 'XGB' or both 'all'.
#' @param class_col A character vector with the name of the column that identify 
#'        the classes.
#' @param seed A numeric vector to set a seed for reproducible results.
#' @param name_0 A character vector with the name for the class==0
#' @param name_1 A character vector with the name for the class==1
#' @param label A character vector with the prefix of the RData file to be outputted.
#' @param allowParallel Logical. If \code{TRUE} allows parallel computation.
#' @param free_cores A numeric vector with the number of cores to be left free 
#'        if \code{allowParallel=TRUE}'.
#' @export 
TrainModel <- function(TrainSet, TestSet, alg = c('all', 'RF', 'XGB'), 
    class_col = class_col, seed = 40, name_0 = name_0, name_1 = name_1,
    label = label, allowParallel = TRUE, free_cores = 4){
    # name_0 == 'cells' |  name_1 == 'debris'/'beads'/'dead'
    # Set seed
    set.seed(seed)    
    
    alg <- match.arg(alg, c('all', 'RF', 'XGB'))

    # Ask for channels for training features   
    print(as.matrix(colnames(TrainSet)))
    prompt <- "Enter the column INDICES of the training features (separated by 
              single space only, no comas allowed) \n"
    features <- as.numeric(strsplit(readline(prompt), " ")[[1]])
    features <- colnames(TrainSet)[features]
    features <- features[order(features)]

    # Classes to names
    TrainSet <- ClassVarToFactor(TrainSet, class_col = class_col, 
        name_0 = name_0, name_1 = name_1)
    TestSet <- ClassVarToFactor(TestSet, class_col = class_col, 
        name_0 = name_0, name_1 = name_1)

    TrainSet <- t_asinh(TrainSet[c(class_col, features)])
    TestSet <- t_asinh(TestSet[c(class_col, features)])

    colnames(TrainSet) <- c(class_col, features)

    colnames(TestSet) <- colnames(TrainSet)



    if(alg == 'RF' || alg == 'all'){
        print('Running Random Forest')
        # Initiating parallelization
        if(allowParallel){
        cluster <- parallel::makeCluster(parallel::detectCores() - free_cores) 
        doParallel::registerDoParallel(cluster)
        }
        # Computing training control parameters
        fitControl <- caret::trainControl(
            method = 'repeatedcv',
            number = 10,
            repeats = 3,
            search = 'grid',
            savePredictions = 'final',
            classProbs = T,
            summaryFunction = twoClassSummary,
            allowParallel = allowParallel,
            returnData = FALSE)

        # Setting tune grid
        tunegrid <- expand.grid(.mtry=c(1:length(features)))

        # Fitting the model
        model_rf <- caret::train(TrainSet[,-1], TrainSet[,1],
            method ='rf',
            trControl = fitControl,
            metric = 'ROC',
            tuneGrid=tunegrid)

        # Assesing feature importance
        ftimp_rf <- caret::varImp(model_rf)

        # Test
        # Prediction on test dataset
        pred_rf <- predict(model_rf, TestSet)
        
        # Compute confusion matrix
        conf_rf <- caret::confusionMatrix(
            reference = as.factor(TestSet[,1]),
            data = pred_rf,
            mode = 'everything',
            positive = name_1)

        #save RData
        save(model_rf, ftimp_rf, pred_rf, conf_rf, file = paste0(label, '_RFmodel.RData'))
        print('Random Forest completed')
        
        # Finalizing parallelization
        if(allowParallel){
        parallel::stopCluster(cluster)
        foreach::registerDoSEQ()
        }
    }
    

    if(alg == 'XGB' || alg == 'all'){
        print('Running XGBoost')
        # Rearrangement
        X_train = xgboost::xgb.DMatrix(as.matrix(TrainSet[,-1]))
        y_train = TrainSet[,1]

        # Computing training control parameters
        xgb_trcontrol = caret::trainControl(
            method = 'repeatedcv',
            number = 10,
            repeats = 3,
            search = 'grid',
            savePredictions = 'final',
            summaryFunction = twoClassSummary,
            allowParallel = allowParallel,
            classProbs = TRUE,
            verboseIter = FALSE,
            returnData = FALSE)

        # Specifying grid space
        xgbGrid <- expand.grid(nrounds = c(100,200),  # this is n_estimators in the python code above
                               max_depth = c(10, 15, 20, 25),
                               colsample_bytree = seq(0.5, 0.9, length.out = 5),
                               ## The values below are default values in the sklearn-api.
                               eta = 0.1,
                               gamma=0,
                               min_child_weight = 1,
                               subsample = 1)

        # Fitting the model
        model_xgb = caret::train(
            X_train, y_train,
            trControl = xgb_trcontrol,
            tuneGrid = xgbGrid,
            method = "xgbTree",
            metric = 'ROC',
            nthread = detectCores() - free_cores)

        # Assesing feature importance
        ftimp_xgb <- caret::varImp(model_xgb)
        
        # Test
        X_test = xgboost::xgb.DMatrix(as.matrix(TestSet[,-1]))
        y_test = TestSet[,1]

        # Prediction on test dataset
        pred_xgb <- predict(model_xgb, X_test)
        # Compute confusion matrix
        conf_xgb <- caret::confusionMatrix(
            reference = as.factor(y_test),
            data = pred_xgb,
            mode = 'everything',
            positive = name_1)

        # Save RData
        save(model_xgb, ftimp_xgb, pred_xgb, conf_xgb, file = paste0(label, '_XGBmodel.RData'))
        print('XGBoost completed')
    }

}




#' Predict class
#'
#' This function predicts the class of the events (rows) based on a classification 
#' model previously trained and returns the row indexes of the positive class.
#'
#' @param df An object of class \code{data.frame}.
#' @param model Object of class \code{train} that contains the model to be used 
#'        for classification of the new data.
#' @param alg A character vector with the name of the classification algorithm 
#'        used to train \code{model}.
#' @param features A character vector with the features used to train \code{model}
#'        (also known as predictors).
#' @param label A character vector with the name of the positive class. (e.g. 'beads')
#'
#' @return Returns a numeric vector with the row indexes of the positive class.
#' @export
predict_cl <- function(df, model = model, alg = c('RF', 'XGB'), 
  features = features, label = label){
    
    alg <- match.arg(alg, c('RF', 'XGB'))

    features <- features[order(features)]

    df <- t_asinh(df[features])
    
    if(alg == 'XGB'){
        df <- xgboost::xgb.DMatrix(as.matrix(df))

    }

    pred <- predict(model, df)

    k <- which(pred == label)

    return(k)
}
msenosain/denoisingCTF documentation built on Jan. 28, 2021, 2:23 a.m.