R/classifiers.R

Defines functions gliomaClassifier

Documented in gliomaClassifier

#' @title Gliomar classifier
#' @description Classify DNA methylation gliomas using data from
#' https://doi.org/10.1016/j.cell.2015.12.028
#' @param data DNA methylation matrix or Summarized Experiments with samples on columns
#' and probes on the rows
#' @export
#' @return A list of 3 data frames:
#' 1) Sample final classification
#' 2) Each model final classification
#' 3) Each class probability of classification
#' @examples
#' \dontrun{
#' query <- GDCquery(
#'    project= "TCGA-GBM",
#'    data.category = "DNA methylation",
#'    barcode = c("TCGA-06-0122","TCGA-14-1456"),
#'    platform = "Illumina Human Methylation 27",
#'    legacy = TRUE
#' )
#' GDCdownload(query)
#' data.hg19 <- GDCprepare(query)
#' classification <- gliomaClassifier(data.hg19)
#'
#' # Comparing reslts
#' TCGAquery_subtype("GBM") %>%
#' dplyr::filter(patient %in% c("TCGA-06-0122","TCGA-14-1456")) %>%
#' dplyr::select("patient","Supervised.DNA.Methylation.Cluster")
#' }
#' @author Tiago Chedraoui Silva, Tathiane Malta, Houtan Noushmehr
gliomaClassifier <- function(data){

    if (!requireNamespace("TCGAbiolinksGUI.data", quietly = TRUE)) {
        stop("TCGAbiolinksGUI.data package is needed for this function to work. Please install it.",
             call. = FALSE)
    }

    message("Plese cite https://doi.org/10.1016/j.cell.2015.12.028")
    message("Training model described at https://bioconductor.org/packages/TCGAbiolinksGUI.data/")

    if(is(data, "RangedSummarizedExperiment")) {
        met <- assay(data) %>% as.matrix  %>% t
    } else {
        met <- data %>% as.matrix  %>% t
    }


    df.all <- NULL
    env <- new.env()
    models <- c("idh","gcimp","idhwt","idhmut")
    models <- paste("glioma",models,"model",sep = ".")
    data(list = models, package = "TCGAbiolinksGUI.data",envir = env)

    for(i in models){
        message("------------------------------------------------------")
        message("Model: ",i)
        model <- get(i, envir = env)
        # If it is a Summarized Experiment object

        # keep only probes used in the model
        aux <- met[,colnames(met) %in% colnames(model$trainingData),drop = FALSE]
        # This should not happen!
        if(any(apply(aux,2,function(x) all(is.na(x))))) {
            message("o Probes has NA value for all samples. Setting to 0.5 since model does not accept NA")
            aux[,apply(aux,2,function(x) all(is.na(x)))] <- 0.5
        }

        if(any(apply(aux,2,function(x) any(is.na(x))))) {
            message("o Probes has NA values for some samples. Setting values a the median of the sample since model does not accept NA ")
            colMedians <- colMedians(aux,na.rm = TRUE)
            x <- which(is.na(aux),arr.ind = TRUE)
            for(l in 1:nrow(x)){
                aux[x[l,1],x[l,2]] <- colMedians[x[l,2]]
            }
        }

        # For missing probes add values to 0.5
        missing_probes <- setdiff(colnames(model$trainingData), colnames(met))
        if(length(missing_probes) > 0) {
            message("o Probes are missing. Setting dummy probes to matrix with 0.5 value to all samples.")
            missing_probes_matrix <- matrix(rep(0.5, nrow(met) * length(missing_probes)),nrow = nrow(met))
            colnames(missing_probes_matrix) <- missing_probes
            aux <- bind_cols(aux,missing_probes_matrix)
        }

        pred <- predict(model, aux)
        pred.prob <- predict(model, aux, type = "prob")
        colnames(pred.prob) <- paste0(i,"_", colnames(pred.prob))
        pred.prob$samples <- rownames(pred.prob)
        df <- data.frame(samples = rownames(aux),
                         groups.classified = pred,
                         stringsAsFactors = FALSE)
        colnames(df)[2] <- paste0(i,"_groups.classified")

        if(is.null(df.all)) {
            df.all <- df
            df.prob <- pred.prob
        } else {
            df.all <- merge(df.all,df, by = "samples")
            df.prob <- merge(df.prob,pred.prob,by = "samples")
        }
    }
    colnames(df.all) <- c("samples",models)
    fctr.cols <- sapply(df.all, is.factor)
    df.all[, fctr.cols] <- sapply(df.all[, fctr.cols], as.character)
    df.all[grep("6|5|4",df.all$glioma.idh.model),c("glioma.gcimp.model","glioma.idhmut.model")]  <- NA
    df.all[grep("3|2|1",df.all$glioma.idh.model),c("glioma.idhwt.model")]  <- NA
    df.all[grep("3",df.all$glioma.idhmut.model),c("glioma.gcimp.model")]  <- "Codel"
    df.all[grep("1",df.all$glioma.idhwt.model),c("glioma.idhwt.model")]  <- "Classic-like"
    df.all[grep("2",df.all$glioma.idhwt.model),c("glioma.idhwt.model")]  <- "Mesenchymal-like"
    df.all[grep("3",df.all$glioma.idhwt.model),c("glioma.idhwt.model")]  <- "PA-like"

    # Final column with results
    df.all$glioma.DNAmethylation.subtype <- NA
    df.all$glioma.DNAmethylation.subtype <- df.all$glioma.idhwt.model
    idx <- which(is.na(df.all$glioma.DNAmethylation.subtype))
    df.all$glioma.DNAmethylation.subtype[idx] <- df.all$glioma.gcimp.model[idx]

    return(
        list(
            final.classification = data.frame(
                Sample = df.all$samples,
                Final_classification = df.all$glioma.DNAmethylation.subtype
            ),
            model.classifications = df.all,
            model.probabilities = df.prob
        )
    )
}
BioinformaticsFMRP/TCGAbiolinks documentation built on April 12, 2024, 2:08 a.m.