R/MainLassoLDATraining.R

Defines functions bootstrap_parallel bootstrap_prediction predicting training

Documented in bootstrap_parallel bootstrap_prediction predicting training

#' Main model training function for finding the best model
#' that characterises a subpopulation
#'
#' @description  Training a haft of all cells to find optimal ElasticNet and LDA
#' models to predict a subpopulation
#' @param mixedpop1 is a \linkS4class{SingleCellExperiment} object from the 
#' train mixed population 
#' @param mixedpop2 is a \linkS4class{SingleCellExperiment} object from the 
#' target mixed population
#' @param genes a vector of gene names (for ElasticNet shrinkage); gene symbols
#' must be in the same format with gene names in subpop2. Note that genes are 
#' listed by the order of importance, e.g. differentially expressed genes that 
#' are most significan, so that if the gene list contains too many genes, only 
#' the top 500 genes are used.
#' @param cluster_mixedpop1 a vector of cluster assignment in mixedpop1
#' @param c_selectID a selected number to specify which subpopulation to be used
#' for training
#' @param out_idx a number to specify index to write results into the list 
#' output. This is needed for running bootstrap.
#' @param standardize a logical value specifying whether or not to standardize 
#' the train matrix
#' @param listData list to store output in
#' @param trainset_ratio a number specifying the proportion of cells to be part
#' of the training subpopulation
#' @param LDA_run logical, if the LDA run is added to compare to ElasticNet
#' @param log_transform boolean whether log transform should be computed
#' @return a \code{list} with prediction results written in to the indexed 
#' \code{out_idx}
#' @export
#' @author Quan Nguyen, 2017-11-25
#' @examples
#'
#' c_selectID<-1
#' out_idx<-1
#' day2 <- day_2_cardio_cell_sample
#' mixedpop1 <-new_scGPS_object(ExpressionMatrix = day2$dat2_counts, 
#'     GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)
#' day5 <- day_5_cardio_cell_sample
#' mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
#' GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)
#' genes <-training_gene_sample
#' genes <-genes$Merged_unique
#' listData  <- training(genes, 
#'     cluster_mixedpop1 = colData(mixedpop1)[, 1],
#'     mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, c_selectID,
#'     listData =list(), out_idx=out_idx, trainset_ratio = 0.5)
#' names(listData)
#' listData$Accuracy

training <- function(genes = NULL, cluster_mixedpop1 = NULL, 
    mixedpop1 = NULL, mixedpop2 = NULL, c_selectID = NULL, listData = list(), 
    out_idx = 1, standardize = TRUE, trainset_ratio = 0.5, LDA_run = FALSE,
    log_transform = FALSE) {
    # subsammpling--------------------------------------------------------------
    # taking a subsampling size of trainset_ratio of the cluster_select out for
    # training
    subpop1cluster_indx <- which(cluster_mixedpop1 == c_selectID)  #class 1 
    message(paste0("Total ", length(subpop1cluster_indx), 
        " cells as source subpop"))
    subremaining1_indx <- which(cluster_mixedpop1 != c_selectID)#remaining class
    message(paste0("Total ", length(subremaining1_indx), 
        " cells in remaining subpops"))
    subsampling <- round(length(subpop1cluster_indx) * trainset_ratio)
    message(paste0("subsampling ", subsampling,
        " cells for training source subpop"))
    subpop1_train_indx <- sample(subpop1cluster_indx, subsampling,
        replace = FALSE)
    # check if there is a very big cluster present in the dataset 
    # C/2 = SubSampling > length(total - C, which is cluster_compare)
    if (length(subremaining1_indx) > subsampling) {
        message(paste0("subsampling ", subsampling, 
            " cells in remaining subpops for training"))
        subremaining1_train_indx <- sample(subremaining1_indx, subsampling, 
            replace = FALSE)
    } else {
        message(paste0("no subsampling, using ", length(subremaining1_indx), 
            " cells in remaining subpops for training"))
        subremaining1_train_indx <- subremaining1_indx
    }
    # done subsampling----------------------------------------------------------
    
    # select top 500 genes for the gene list used in the model------------------
    # The DE_result is a sorted list by p-values
    if (length(genes) > 500) {
        genes <- genes[seq_len(500)]
    }
    # select genes in both mixedpop1 and mixedpop2
    names1 <- elementMetadata(mixedpop1)[, 1]
    subpop1_selected_genes <- names1[which(names1 %in% genes)]
    names2 <- elementMetadata(mixedpop2)[, 1]
    genes_in_both <- names2[which(names2 %in% subpop1_selected_genes)]
    genes_in_both_idx1 <- which(names1 %in% genes_in_both)
    message(paste0("use ", length(genes_in_both_idx1), 
        " genes for training model"))
    # done selecting genes------------------------------------------------------
    
    # prepare ElasticNet training
    # matrices-----------------------------------------
    # prepare predictor matrix containing both clustering classes
    predictor_S1 <- mixedpop1[genes_in_both_idx1, c(subpop1_train_indx, 
        subremaining1_train_indx)]
    if (log_transform) {
        predictor_S1 <- assays(predictor_S1)$logcounts
    } else {
        predictor_S1 <- assays(predictor_S1)$counts
    }

    message(paste0("use ", dim(predictor_S1)[1], " genes ", 
        dim(predictor_S1)[2], " cells for testing model"))
    # generate categorical response
    
    # assign class labels############ rename the group of remaining clusters
    c_compareID <- unique(cluster_mixedpop1)
    c_compareID <- paste0(c_compareID[-which(c_compareID == c_selectID)], 
        collapse = "_")
    message(paste0("rename remaining subpops to ", c_compareID))
    # change cluster number to character
    c_selectID <- as.character(c_selectID)
    # done assigning class labels####
    
    # set all y values to cluster select (character type) length = #cells
    y_cat = rep(c_selectID, ncol(predictor_S1))
    
    # replace values for cluster compare get indexes for cells in predictor_S1
    # belong to 'remaining class'
    remainingClass_Indx_in_y <- which(colnames(predictor_S1) %in% 
        colnames(mixedpop1[, subremaining1_train_indx]))
    
    # change value of the cluster id for the remaining class
    y_cat[remainingClass_Indx_in_y] <- rep(c_compareID, 
        length(remainingClass_Indx_in_y))
    message(paste0("there are ", length(remainingClass_Indx_in_y), 
        " cells in class ", c_compareID, " and ", 
        length(y_cat) - length(remainingClass_Indx_in_y), " cells in class ", 
        c_selectID))
    # Done prepare ElasticNet training matrices---------------------------------
    
    # prepare training matrices------------------------------------------------
    
    # fitting with lda, also with cross validation
    dataset <- t(predictor_S1)  #(note predictor_S1 =t(gene_S1))
    # remove genes with no variation across cells and duplicated genes
    Zero_col <- which(colSums(dataset) == 0)
    duplicated_col <- which(duplicated(colnames(dataset)) == TRUE)
    if (length(c(Zero_col, duplicated_col)) != 0) {
        message(paste0("removing ", length(c(Zero_col, duplicated_col)), 
            " genes with no variance"))
        dataset <- dataset[, -c(Zero_col, duplicated_col)]
    }
    
    # scaled and centered data before training standardizing data (centered and
    # scaled) - this step is not necessary in the function glmnet
    standardizing <- function(X) {
        X <- X - mean(X)
        X <- X/sd(X)
        return(X)
    }
    
    if (standardize) {
        message("standardizing prediction/target dataset")
        dataset <- t(apply(dataset, 1, standardizing))  
        #dataset is transposed after standardised and scaled
    }
    dataset <- as.data.frame(dataset)
    dataset$Cluster_class <- as.character(as.vector(y_cat))
    # remove NA before prediction (if a gene has zero variance, it will have NA
    # values after standardization)
    dataset <- na.omit(dataset)
    # fitting with cross validation to find the best ElasticNet model
    message("performning elasticnet model training...")
    cvfit = cv.glmnet(as.matrix(dataset[, -which(colnames(dataset) == 
        "Cluster_class")]), as.vector(dataset$Cluster_class), 
        family = "binomial", type.measure = "class")
    
    # fit LDA
    if (LDA_run) {
        message("performning LDA model training...")
        trainControl <- trainControl(method = "repeatedcv", number = 10, 
            repeats = 3)
        fit.lda <- train(Cluster_class ~ ., preProcess = NULL, data = dataset, 
            method = "lda", metric = "Accuracy", trControl = trainControl, 
            na.action = na.omit, standardize = FALSE)
    }
    
    # Done fitting the ElasticNet and LDA models--------------------------------
    
    # Extract coefficient Beta for a gene for an optimized lambda value---------
    message("extracting deviance and best gene features...")
    cvfit_out <- as.matrix(coef(cvfit, s = cvfit$lambda.min))
    cvfit_out <- as.data.frame(cvfit_out)
    # Find number of genes with coefficient different to 0
    cvfit_out$name <- row.names(cvfit_out)
    sub_cvfit_out <- cvfit_out[cvfit_out$`1` != 0, ]
    # Extract deviance explained
    log <- utils::capture.output({
        t_DE <- as.matrix(print(cvfit$glmnet.fit))
        })
    dat_DE <- as.data.frame(t_DE)
    colnames(dat_DE) <- c("Dfd", "Deviance", "lambda")
    # Get the coordinate for lambda that produces minimum error
    dat_DE_Lambda_idx <- c()
    for (i in seq_len(nrow(dat_DE))) {
        if (dat_DE$lambda[i] == round(cvfit$lambda.min, 
            nchar(dat_DE$lambda[i]) - 2)) {
            dat_DE_Lambda_idx <- c(dat_DE_Lambda_idx, i)
        }
    }
    if (length(dat_DE_Lambda_idx) > 0) {
        dat_DE <- dat_DE[seq_len(dat_DE_Lambda_idx[1]), ]
        message(paste0("lambda min is at location ", dat_DE_Lambda_idx[1]))
    } else {
        message("no lambda min found, please check output ...")
    }
    
    dat_DE_fm_DE <- dat_DE %>% group_by(dat_DE$Dfd) %>% 
        summarise(Deviance = max(dat_DE$Deviance))
    dat_DE_fm_DE <- as.data.frame(dat_DE_fm_DE)
    dat_DE_fm_DE$DEgenes <- paste0("genes_cluster", c_selectID)
    remaining <- c("remaining DEgenes")
    dat_DE_fm_DE <- rbind(dat_DE_fm_DE, remaining)
    # Done extracting the coefficients------------------------------------------
    
    # fit the model on the leave-out data to estimate accuracy------------------
    
    # Keep all cells except for those used in the training set note that:
    # subpop1cluster_indx is for the total cells in the source subpop
    # subpop1_train_indx is for the subsampled cells as in
    # subpop1cluster_indx[sample(seq_len(ncol(mixedpop1)),subsampling ,
    #     replace = F)]
    
    cluster_select_indx_Round2 <- subpop1cluster_indx[-which(subpop1cluster_indx
        %in% subpop1_train_indx)]
    message(paste0("the leave-out cells in the source subpop is ", 
        length(cluster_select_indx_Round2)))
    
    # subremaining1_train_indx is the index for remaining subpops used for 
    # training
    # subremaining1_indx is the total cells in the remaining subpops
    
    subremaining1_train_order <- which(subremaining1_indx %in% 
        subremaining1_train_indx)  
        #to remove remaininng class already used for training 
    
    # Select cluster_compare_indx_Round2 as the remaining cells in class 2 
    # (opposite to class 1)
    if ((length(subremaining1_indx) - length(subremaining1_train_indx)) > 
        subsampling) {

        cluster_compare_indx_Round2 <- sample(subremaining1_indx[
            -subremaining1_train_order], subsampling, replace = FALSE)
        message(paste0("use ", subsampling, 
            " target subpops cells for leave-out test set"))
    } else {
        cluster_compare_indx_Round2 <- 
            subremaining1_indx[-subremaining1_train_order]  
            #use all of the class 2 cells after taken those for training 
        message(paste0("use ", length(cluster_compare_indx_Round2), 
            " target subpop cells for leave-out test set"))
    }
    
    # the cells present in select (class 1) vs compare (remaining class, i.e.  
    # class 2) c(cluster_select_indx_Round2, cluster_compare_indx_Round2) 
    # use the leave-out dataset in mixed pop1 to evaluate the model
    temp_S2 <- mixedpop1[, c(cluster_select_indx_Round2,
        cluster_compare_indx_Round2)]  #genes_in_trainset_idx_ordered

    if (log_transform) {
        temp_S2 <- assays(temp_S2)$logcounts
    } else {
        temp_S2 <- assays(temp_S2)$counts
    }

    predictor_S2 <- t(temp_S2)
    
    # before prediction, check genes in cvfit and predictor_S2 are compatible
    # Get gene names
    gene_cvfit <- cvfit$glmnet.fit$beta@Dimnames[[1]]
    names <- colnames(predictor_S2)
    cvfitGenes_idx <- match(gene_cvfit, as.character(names))  
    #names are genes in target subpop
    if (length(cvfitGenes_idx) == 0) {
        message("No genes in the model present in the data...check your input")
    }
    
    # check how many genes in gene_cvfit but not in mixedpop2
    to_add <- which(is.na(cvfitGenes_idx) == TRUE)
    if (length(to_add) > 0) {
        message(paste0("add ", length(to_add),
            " random indexes for genes in model but not in target subpop, ",
            "later to be replaced by 0"))
        to_add_idx <- sample(cvfitGenes_idx[-to_add], length(to_add), 
            replace = FALSE)
        cvfitGenes_idx[to_add] <- to_add_idx  #replace NA indexes by random 
    }
    
    if (length(cvfitGenes_idx) > 0) {
        predictor_S2 <- predictor_S2[, cvfitGenes_idx]
    }
    # replace gene values for those in model but not in prediction dataset by NA
    if (length(to_add) > 0) {
        message("Replacing missing genes by NA...")
        predictor_S2[, to_add] <- NA
    }
    
    # standardizing the leave-out target and source subpops
    if (standardize) {
        message("standardizing the leave-out target and source subpops...")
        predictor_S2 <- t(apply(predictor_S2, 1, standardizing))
    }
    # remove NA before prediction (if a gene has zero variance, it will have NA
    # values after standardization)
    predictor_S2 <- na.omit(predictor_S2)
    # Evaluation: predict ElasticNet Start prediction for estimating accuracy
    message("start ElasticNet prediction for estimating accuracy...")
    predict_clusters <- predict(cvfit, newx = predictor_S2, type = "class", 
        s = cvfit$lambda.min)
    
    # Evaluation: predict LDA
    if (LDA_run) {
        message(paste0("start LDA prediction for estimating accuracy for ", 
            nrow(predictor_S2), " cells and ", ncol(predictor_S2), " genes..."))
        lda_predict <- predict(fit.lda, predictor_S2)
    }
    # Done validation test to estimate accuracy---------------------------------
    
    # Estimate accuracy---------------------------------------------------------
    # Reference classification for comparing predicted results this 
    # cellNames_cluster is to calculate the accuracy
    cellNames_cluster <- cbind(colnames(mixedpop1), cluster_mixedpop1)
    # Compare to original clusering classes to check for accuracy (just for the
    # source subpop), using the initial cellNames_cluster
    predictor_S2_name <- row.names(predict_clusters)
    predict_label <- predict_clusters[, 1]
    
    predict_index <- which(cellNames_cluster[, 1] %in% predictor_S2_name)
    
    original_cluster <- cellNames_cluster[predict_index, ]
    original_cluster <- original_cluster[order(original_cluster[, 2], 
        decreasing = TRUE), ]
    original_cluster <- as.data.frame(original_cluster)
    # for ElasticNet
    predict_cluster_dat <- as.data.frame(predict_label)
    predict_cluster_dat$cellnames <- row.names(predict_cluster_dat)
    # merge original and predicted datasets
    compare <- merge(original_cluster, predict_cluster_dat, by.x = "V1", 
        by.y = "cellnames")
    # fing how many predictions are accurate
    cluster_select_predict <- subset(compare, 
        (as.numeric(compare[, 2]) == c_selectID & compare$predict_label == 
            c_selectID) | (as.numeric(compare[, 2]) != c_selectID & 
            compare$predict_label != c_selectID))
    accurate <- dim(cluster_select_predict)[1]
    inaccurate <- dim(compare)[1] - dim(cluster_select_predict)[1]
    list_acc_inacc <- list(accurate, inaccurate)
    message(paste0("evaluation accuracy ElasticNet ", 
        accurate/(accurate + inaccurate)))
    # for LDA
    if (LDA_run) {
        predict_cluster_dat_LDA <- as.data.frame(lda_predict)
        predict_cluster_dat_LDA$cellnames <- row.names(predictor_S2)
        # merge original and predicted datasets
        compare <- merge(original_cluster, predict_cluster_dat_LDA, by.x = "V1",
            by.y = "cellnames")
        # fing how many predictions are accurate
        cluster_select_predict_LDA <- subset(compare, 
            (as.numeric(compare[, 2]) == c_selectID & compare$lda_predict == 
                c_selectID) | (as.numeric(compare[, 2]) != c_selectID & 
                compare$lda_predict != c_selectID))
        accurate_LDA <- dim(cluster_select_predict_LDA)[1]
        inaccurate_LDA <- dim(compare)[1] - dim(cluster_select_predict_LDA)[1]
        list_acc_inacc <- list(accurate_LDA, inaccurate_LDA)
        message(paste0("evaluation accuracy LDA ", 
            accurate_LDA/(accurate_LDA + inaccurate_LDA)))
    }
    
    # done estimate accuracy----------------------------------------------------
    
    # write the 5 lists into the output object
    
    listData$Accuracy[[out_idx]] <- list(list_acc_inacc)
    listData$ElasticNetGenes[[out_idx]] <- list(sub_cvfit_out)
    listData$Deviance[[out_idx]] <- list(dat_DE_fm_DE)
    listData$ElasticNetFit[[out_idx]] <- list(cvfit)
    if (LDA_run) {
        listData$LDAFit[[out_idx]] <- list(fit.lda)
    } else {
        listData$LDAFit[[out_idx]] <- list(NA)
    }
    listData$predictor_S1[[out_idx]] <- list(t(dataset))
    
    return(listData)
}


#' Main prediction function applying the optimal ElasticNet and LDA models
#'
#' @description  Predict a new mixed population after training the model for a
#' subpopulation in the first mixed population.
#' All subpopulations in the new target mixed population will be predicted, 
#' where each targeted subpopulation will have a transition score from the 
#' orginal subpopulation to the new subpopulation.
#' @param listData a \code{list} object containing trained results for the
#' selected subpopulation in the first mixed population
#' @param mixedpop2 a \linkS4class{SingleCellExperiment} object from the target
#' mixed population of importance, e.g. differentially expressed genes that are
#' most significant
#' @param out_idx a number to specify index to write results into the list 
#' output. This is needed for running bootstrap.
#' @param cluster_mixedpop2 a vector of cluster assignment for mixedpop2
#' @param standardize a logical of whether to standardize the data 
#' @param LDA_run logical, if the LDA prediction is added to compare to 
#' ElasticNet, the LDA model needs to be trained from the training before
#' inputting to this prediction step
#' @param c_selectID a number to specify the trained cluster used for prediction
#' @param log_transform boolean whether log transform should be computed
#' @return a \code{list} with prediction results written in to the index
#' \code{out_idx}
#' @export
#' @author Quan Nguyen, 2017-11-25
#' @examples
#' c_selectID<-1
#' out_idx<-1
#' day2 <- day_2_cardio_cell_sample
#' mixedpop1 <-new_scGPS_object(ExpressionMatrix = day2$dat2_counts, 
#'     GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)
#' day5 <- day_5_cardio_cell_sample
#' mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts, 
#'     GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)
#' genes <-training_gene_sample
#' genes <-genes$Merged_unique
#' listData  <- training(genes, 
#'     cluster_mixedpop1 = colData(mixedpop1)[, 1], mixedpop1 = mixedpop1, 
#'     mixedpop2 = mixedpop2, c_selectID, listData =list(), out_idx=out_idx)
#' listData  <- predicting(listData =listData,  mixedpop2 = mixedpop2, 
#'     out_idx=out_idx, cluster_mixedpop2 = colData(mixedpop2)[, 1], 
#'     c_selectID = c_selectID)
#'


predicting <- function(listData = NULL, cluster_mixedpop2 = NULL, 
    mixedpop2 = NULL, out_idx = NULL, standardize = TRUE, LDA_run = FALSE, 
    c_selectID = NULL, log_transform = FALSE) {
    # predictor_S1 is the dataset used for the training phase 
    # (already transposed)
    predictor_S1 <- listData$predictor_S1[[out_idx]][[1]]  #1 for extract matrix
    cvfit_best <- listData$ElasticNetFit[[out_idx]][[1]]
    fit.lda <- listData$LDAFit[[out_idx]][[1]]
    
    my.clusters <- cluster_mixedpop2

    if (log_transform) {
        ori_dat_2 <- assays(mixedpop2)$logcounts
    } else {
        ori_dat_2 <- assays(mixedpop2)$counts
    }
    
    # standardizing data (centered and scaled)
    standardizing <- function(X) {
        X <- X - mean(X)
        X <- X/sd(X)
        return(X)
    }
    
    if (standardize) {
        message("standardizing target subpops before prediction...")
        ori_dat_2 <- t(apply(ori_dat_2, 1, standardizing))
    }
    
    list_predict_clusters_ElasticNet <- vector(mode = "list",
        length = length(unique(my.clusters)))
    list_predict_clusters_LDA <- vector(mode = "list",
        length = length(unique(my.clusters)))
    list_cell_results <- vector(mode = "list")
    
    for (clust in unique(my.clusters)) {
        message(paste0("predicting from source to target subpop ", clust, 
            "..."))
        c_selectID_2 <- clust
        cluster_select <- which(my.clusters == c_selectID_2)  #select cells
        message(paste0("number of cells in the target subpop ", clust, " is ", 
            length(cluster_select)))
        # Get gene names from the trained model
        target_genes <- elementMetadata(mixedpop2)[, 1]
        
        # A function to match target dataset
        matching_genes <- function(target_dataset, model_genes, 
            cluster_idx = cluster_select) {

            target_genes <- row.names(target_dataset)
            targetGenes_remove <- which(!(as.character(target_genes) %in%
                model_genes))  #genes in target but not in modelGenes
            message(paste0("Number of genes in the target data, ",
                "but not in model genes is ", length(targetGenes_remove)))
            target_dataset <- target_dataset[-targetGenes_remove, ]
            # update target genes
            target_genes <- row.names(target_dataset)
            # assumming the same gename formats between target and model
            modelGenes_idx <- match(model_genes, as.character(target_genes))  
            #which indexes of the target genes that match model genes
            message(paste0("Number of genes in the model present ",
                "in the target data is ", 
                length(na.omit(modelGenes_idx))))
            if (length(na.omit(modelGenes_idx)) == 0) {
                message(paste0("No genes in the model present in the data...",
                    "check your input"))
            }
            # check how many genes in model_genes but not in mixedpop2 and add 
            # those genes into the target
            to_add <- which(is.na(modelGenes_idx) == TRUE)
            message(paste0("There are ", length(to_add), 
                " genes that are in the model, ", 
                "but not in target subpopulations"))
            # update target_dataset
            predictor_S2_temp <- target_dataset[na.omit(modelGenes_idx),
                cluster_select]
            
            if (length(to_add) > 0) {
                message(paste0("for dim conformation, add ", length(to_add),
                    " random indexes for genes in model",
                    " but not in target subpop, later to be replaced by 0"))
                # if the genes not in the target dataset, 
                # set the expression values to 0
                to_add_df <- as.data.frame(matrix(0, nrow = length(to_add), 
                    ncol = length(cluster_select)))  #columns = number of cells
                row.names(to_add_df) <- gsub("`", "", model_genes[to_add]) 
                #format special names with '`MCM3AP-AS1`'
                message(paste0("genes to be added to target subpop are ", 
                    as.vector(row.names(to_add_df))))
                colnames(to_add_df) <- colnames(predictor_S2_temp)
                predictor_S2_temp <- rbind(predictor_S2_temp, to_add_df)
            }
            
            message(paste0("the prediction (target) subop has ",
                dim(predictor_S2_temp)[1], " genes and ", 
                dim(predictor_S2_temp)[2], " cells. The trained model has ", 
                length(model_genes), " genes"))
            message("first 10 genes in model ")
            message(head(model_genes, n = 10))
            message("first 10 genes in target ")
            message(head(row.names(predictor_S2_temp), n = 10))
            return(predictor_S2_temp)
        }
        
        # update predictor dataset
        predictor_S2_temp <- matching_genes(target_dataset = ori_dat_2, 
            model_genes = cvfit_best$glmnet.fit$beta@Dimnames[[1]], 
            cluster_idx = cluster_select)
        # predict ElasticNet:
        message("running elasticNet classification...")
        predict_clusters_ElasticNet <- predict(cvfit_best, 
            newx = t(predictor_S2_temp), type = "class",
            s = cvfit_best$lambda.min)
        list_cell_results[[clust]] <- predict_clusters_ElasticNet
        ElasticNet_result <- as.data.frame(table(predict_clusters_ElasticNet))

        #convert table() to 2x2 dataframe, 
        # it will always have 2 variable names: $name, Freq
        report_ElasticNet <- paste0("ElasticNet for subpop", c_selectID_2,
            " in target mixedpop2")
        # report class probability
        if (ncol(ElasticNet_result) == 2) {
            ElasticNet_cluster_idx <- which(ElasticNet_result[, 1] == 
                c_selectID)
            if (length(ElasticNet_cluster_idx) > 0) {

                predict_clusters_ElasticNet_proportion <- as.numeric(
                    ElasticNet_result[ElasticNet_cluster_idx, 2])/
                    sum(as.numeric(ElasticNet_result[, 2])) * 100
                message(paste0("class probability prediction ElasticNet",
                    " for target subpop ", clust, " is ", 
                    predict_clusters_ElasticNet_proportion))
                
            } else {
                predict_clusters_ElasticNet_proportion = 0
                message(paste0("class probability prediction ElasticNet ",
                    "for target subpop ", clust, " is 0"))
            }
            predict_clusters_ElasticNet <- list(report_ElasticNet, 
                predict_clusters_ElasticNet_proportion)
        } else {
            message(paste0("ElasticNet for target subpop ", clust,
                " has no solution"))
            predict_clusters_ElasticNet <- list(report_ElasticNet, "NA")
        }
        
        # write to the next list level in the list if bootstrap is used
        list_predict_clusters_ElasticNet <- c(list_predict_clusters_ElasticNet,
            predict_clusters_ElasticNet)
        
        # done checking---------------------------------------------------------
        # predict
        # LDA:
        if (LDA_run) 
            {
                # newdataset <- as.data.frame(t(ori_dat_2[,cluster_select])) 
                # for better LDA conversion, the target data should not be 
                # standardised
                if (log_transform) {
                    ori_dat_2 <- assays(mixedpop2)$logcounts
                } else {
                    ori_dat_2 <- assays(mixedpop2)$counts
                }
                
                newdataset <- matching_genes(target_dataset = ori_dat_2, 
                    model_genes = fit.lda$finalModel$xNames, 
                    cluster_idx = cluster_select)
                
                message("running LDA classification...")
                predict_clusters_LDA <- predict(fit.lda, t(newdataset), 
                    na.action = na.omit)
                
                LDA_result <- as.data.frame(table(predict_clusters_LDA))  
                # convert table() to 2x2 dataframe, it will always have 2 
                # variable names: $name,
                report_LDA <- paste0("LDA for subpop ", c_selectID_2, 
                    " in target mixedpop2")
                if (ncol(LDA_result) == 2) {
                    LDA_cluster_idx <- which(LDA_result[, 1] == c_selectID)
                    predict_clusters_LDA <- as.numeric(LDA_result[
                        LDA_cluster_idx, 2])/
                        sum(as.numeric(LDA_result[, 2])) * 100
                    message(paste0("class probability prediction LDA ",
                        "for target subpop ", 
                        clust, " is ", predict_clusters_LDA))
                    # write prediction output
                    predict_clusters_LDA <- list(report_LDA,
                        predict_clusters_LDA)
                } else {
                    message(paste0("prediction LDA has no solution for ",
                        "target subpop ", clust))
                    # write prediction output
                    predict_clusters_LDA <- list(report_LDA, "NA")
                }
                
                list_predict_clusters_LDA <- c(list_predict_clusters_LDA, 
                    predict_clusters_LDA)
            }  # end LDA run  
    }  # end the loop through all subpops
    
    # write to the next list level in the list if bootstrap is used to write
    # prediction result
    listData$ElasticNetPredict[out_idx] <- list(
        list_predict_clusters_ElasticNet)
    if (LDA_run) {
        listData$LDAPredict[out_idx] <- list(list_predict_clusters_LDA)
    } else {
        listData$LDAPredict[out_idx] <- list(NA)
    }
    listData$cell_results <- list_cell_results
    return(listData)
}


#' BootStrap runs for both scGPS training and prediction
#'
#' @description  ElasticNet and LDA prediction for each of all the 
#' subpopulations in the new mixed population after training the model for a 
#' subpopulation in the first mixed population. The number of bootstraps to be 
#' run can be specified.
#' @seealso \code{\link{bootstrap_parallel}} for parallel options
#' @param listData  a \code{list} object, which contains trained results for the
#' first mixed population
#' @param mixedpop1 a \linkS4class{SingleCellExperiment} object from a mixed 
#' population for training
#' @param mixedpop2 a \linkS4class{SingleCellExperiment} object from a target 
#' mixed population for prediction
#' @param cluster_mixedpop1 a vector of cluster assignment for mixedpop1 
#' @param cluster_mixedpop2 a vector of cluster assignment for mixedpop2 
#' @param c_selectID the root cluster in mixedpop1 to becompared to clusters in
#' mixedpop2 
#' @param genes a gene list to build the model
#' @param verbose a logical whether to display additional messages
#' @param nboots a number specifying how many bootstraps to be run
#' @param trainset_ratio a number specifying the proportion of cells to be part 
#' of the training subpopulation
#' @param LDA_run logical, if the LDA prediction is added to compare to 
#' ElasticNet
#' @param log_transform boolean whether log transform should be computed
#' @return a \code{list} with prediction results written in to the index 
#' \code{out_idx}
#' @export
#' @author Quan Nguyen, 2017-11-25
#' @examples
#' day2 <- day_2_cardio_cell_sample
#' mixedpop1 <-new_scGPS_object(ExpressionMatrix = day2$dat2_counts, 
#'     GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)
#' day5 <- day_5_cardio_cell_sample
#' mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts, 
#'     GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)
#' genes <-training_gene_sample
#' genes <-genes$Merged_unique
#' cluster_mixedpop1 <- colData(mixedpop1)[,1]
#' cluster_mixedpop2 <- colData(mixedpop2)[,1]
#' c_selectID <- 2
#' test <- bootstrap_prediction(nboots = 1, mixedpop1 = mixedpop1, 
#'     mixedpop2 = mixedpop2, genes=genes, listData =list(), 
#'     cluster_mixedpop1 = cluster_mixedpop1, 
#'     cluster_mixedpop2 = cluster_mixedpop2, c_selectID = c_selectID)
#' names(test)
#' test$ElasticNetPredict
#' test$LDAPredict

bootstrap_prediction <- function(nboots = 1, genes = genes, 
    mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, c_selectID = NULL, 
    listData = list(), cluster_mixedpop1 = NULL, cluster_mixedpop2 = NULL, 
    trainset_ratio = 0.5, LDA_run = TRUE, verbose = FALSE, 
    log_transform = FALSE) {
    if (verbose) {
        for (out_idx in seq_len(nboots)) {
        listData <- training(genes = genes, mixedpop1 = mixedpop1, 
            mixedpop2 = mixedpop2, trainset_ratio = trainset_ratio, c_selectID,
            listData = listData, out_idx = out_idx, 
            cluster_mixedpop1 = cluster_mixedpop1, standardize = TRUE, 
            LDA_run = LDA_run, log_transform = log_transform)
        message(paste0("done training for bootstrap ", out_idx,
            ", moving to prediction..."))
        listData <- predicting(listData = listData, mixedpop2 = mixedpop2,
            out_idx = out_idx, standardize = TRUE, 
            cluster_mixedpop2 = cluster_mixedpop2, 
            LDA_run = LDA_run, c_selectID = c_selectID, 
            log_transform = log_transform)
        }
    } else {
        for (out_idx in seq_len(nboots)) {
            listData <- suppressMessages(training(genes = genes, 
                mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, 
                trainset_ratio = trainset_ratio, c_selectID,
                listData = listData, out_idx = out_idx, 
                cluster_mixedpop1 = cluster_mixedpop1, standardize = TRUE, 
                LDA_run = LDA_run, log_transform = log_transform))
            message(paste0("done training for bootstrap ", out_idx,
                ", moving to prediction..."))
            listData <- suppressMessages(predicting(listData = listData,
                mixedpop2 = mixedpop2, out_idx = out_idx, standardize = TRUE, 
                cluster_mixedpop2 = cluster_mixedpop2, 
                LDA_run = LDA_run, c_selectID = c_selectID, 
                log_transform = log_transform))
        }
    }
        return(listData)
}


#' BootStrap runs for both scGPS training and prediction
#' with parallel option
#'
#' @description  same as bootstrap_prediction, but with an multicore option
#' @param listData  a \code{list} object, which contains trained results for the
#' first mixed population
#' @param mixedpop1 a \linkS4class{SingleCellExperiment} object from a mixed 
#' population for training
#' @param mixedpop2 a \linkS4class{SingleCellExperiment} object from a target 
#' mixed population for prediction
#' @param cluster_mixedpop1 a vector of cluster assignment for mixedpop1 
#' @param cluster_mixedpop2 a vector of cluster assignment for mixedpop2 
#' @param genes a gene list to build the model
#' @param nboots a number specifying how many bootstraps to be run
#' @param ncores a number specifying how many cpus to be used for running
#' @param c_selectID the root cluster in mixedpop1 to becompared to clusters in
#' mixedpop2 
#' @return a \code{list} with prediction results written in to the index 
#' \code{out_idx}
#' @export
#' @author Quan Nguyen, 2017-11-25
#' @examples
#' day2 <- day_2_cardio_cell_sample
#' mixedpop1 <-new_scGPS_object(ExpressionMatrix = day2$dat2_counts, 
#'     GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)
#' day5 <- day_5_cardio_cell_sample
#' mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts, 
#'     GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)
#' genes <-training_gene_sample
#' genes <-genes$Merged_unique
#' #prl_boots <- bootstrap_parallel(ncores = 4, nboots = 1, genes=genes,
#' #    mixedpop1 = mixedpop2, mixedpop2 = mixedpop2,  c_selectID=1,
#' #    listData =list())
#' #prl_boots[[1]]$ElasticNetPredict
#' #prl_boots[[1]]$LDAPredict
#'

bootstrap_parallel <- function(ncores = 4, nboots = 1, genes = genes, 
    mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, c_selectID, listData = list(),
    cluster_mixedpop1 = NULL, cluster_mixedpop2 = NULL) {
    
    bootstrap_single <- function(genes = genes, mixedpop1 = mixedpop1, 
        mixedpop2 = mixedpop2, c_selectID = c_selectID, out_idx = 1, 
        listData = list(), cluster_mixedpop1 = NULL, cluster_mixedpop2 = NULL) {
        listData <- training(genes = genes, mixedpop1 = mixedpop1, 
            mixedpop2 = mixedpop2, c_selectID, listData = listData, 
            out_idx = 1, cluster_mixedpop1 = cluster_mixedpop1)
        listData <- predicting(listData = listData, mixedpop2 = mixedpop2,
            out_idx = 1, standardize = TRUE, 
            cluster_mixedpop2 = cluster_mixedpop2)
        return(listData)
    }
    
    BiocParallel::register(BiocParallel::MulticoreParam(workers = ncores, 
        progressbar = TRUE))
    
    
    listData <- BiocParallel::bplapply(seq_len(nboots), bootstrap_single, 
        genes = genes, mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, 
        c_selectID = c_selectID, listData = list())
    return(listData)
}
IMB-Computational-Genomics-Lab/scGPS documentation built on Dec. 6, 2020, 3:20 p.m.