R/analyze.R

Defines functions TCGAanalyze_Stemness matchedMetExp getAdjacencyBiogrid TCGAanalyze_networkInference TCGAanalyze_Pathview TCGAanalyze_analyseGRN TCGAanalyze_DEA_Affy TCGAanalyze_EAcomplete TCGAanalyze_LevelTab UseRaw_afterFilter TCGAanalyze_Normalization TCGAanalyze_Filtering TCGAanalyze_SurvivalKM TCGAanalyze_Preprocessing TCGAanalyze_Clustering

Documented in getAdjacencyBiogrid matchedMetExp TCGAanalyze_analyseGRN TCGAanalyze_Clustering TCGAanalyze_DEA_Affy TCGAanalyze_EAcomplete TCGAanalyze_Filtering TCGAanalyze_LevelTab TCGAanalyze_networkInference TCGAanalyze_Normalization TCGAanalyze_Pathview TCGAanalyze_Preprocessing TCGAanalyze_Stemness TCGAanalyze_SurvivalKM UseRaw_afterFilter

#' @title Hierarchical cluster analysis
#' @description Hierarchical cluster analysis using several methods such as
#'  ward.D", "ward.D2", "single", "complete", "average" (= UPGMA),
#'  "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare.
#' @param method is method to be used for generic cluster such as 'hclust'
#' or 'consensus'
#' @param methodHC is method to be used for Hierarchical cluster.
#' @import stats
#' @export
#' @return object of class hclust if method selected is 'hclust'.
#' If method selected is 'Consensus' returns a list of length maxK
#' (maximum cluster number to evaluate.). Each element is a list containing
#' consensusMatrix (numerical matrix), consensusTree (hclust), consensusClass
#' (consensus class assignments). ConsensusClusterPlus also produces images.
TCGAanalyze_Clustering <-
    function(tabDF, method,  methodHC = "ward.D2") {
        if (!requireNamespace("ConsensusClusterPlus", quietly = TRUE)) {
            stop("ConsensusClusterPlus is needed. Please install it.",
                 call. = FALSE)
        }

        if (method == "hclust") {
            ans <- hclust(ddist <- dist(tabDF), method = methodHC)
        }

        if (method == "consensus") {
            sHc <- hclust(ddist <- dist(tabDF), method = methodHC)
            ans <-
                ConsensusClusterPlus::ConsensusClusterPlus(
                    ddist,
                    maxK = 7,
                    pItem = 0.9,
                    reps = 1000,
                    title = "mc_consensus_k7_1000",
                    clusterAlg = "hc",
                    innerLinkage = "ward.D2",
                    finalLinkage = "complete",
                    plot = 'pdf',
                    writeTable = TRUE
                )
        }

        return(ans)
    }


#' @title Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier
#' @description TCGAanalyze_Preprocessing perform Array Array Intensity correlation (AAIC).
#' It defines a square symmetric matrix of spearman correlation among samples.
#' According this matrix and boxplot of correlation samples by samples it is possible
#' to find samples with low correlation that can be identified as possible outliers.
#' @param object of gene expression of class RangedSummarizedExperiment from TCGAprepare
#' @param cor.cut is a threshold to filter samples according their spearman correlation in
#' samples by samples. default cor.cut is 0
#' @param filename Filename of the image file
#' @param width Image width
#' @param height Image height
#' @param datatype is a string from RangedSummarizedExperiment assay
#' @importFrom grDevices dev.list
#' @importFrom SummarizedExperiment assays
#' @export
#' @return Plot with array array intensity correlation and boxplot of correlation samples by samples
TCGAanalyze_Preprocessing <- function(object,
                                      cor.cut = 0,
                                      filename = NULL,
                                      width = 1000,
                                      height = 1000,
                                      datatype = names(assays(object))[1]) {
    # This is a work around for raw_counts and raw_count
    if (grepl("raw_count", datatype) &
        any(grepl("raw_count", names(assays(object)))))
        datatype <-
            names(assays(object))[grepl("raw_count", names(assays(object)))]

    if (!any(grepl(datatype, names(assays(object)))))
        stop(
            paste0(
                datatype,
                " not found in the assay list: ",
                paste(names(assays(object)), collapse = ", "),
                "\n  Please set the correct datatype argument."
            )
        )

    if (!(is.null(dev.list()["RStudioGD"]))) {
        dev.off()
    }
    if (is.null(filename))
        filename <- "PreprocessingOutput.png"
    png(filename, width = width, height = height)
    par(oma = c(10, 10, 10, 10))
    ArrayIndex <-  as.character(1:length(colData(object)$barcode))

    pmat_new <- matrix(0, length(ArrayIndex), 4)
    colnames(pmat_new) <- c("Disease", "platform", "SampleID", "Study")
    rownames(pmat_new) <- as.character(colData(object)$barcode)
    pmat_new <- as.data.frame(pmat_new)
    pmat_new$Disease <- as.character(colData(object)$definition)
    pmat_new$platform <- "platform"
    pmat_new$SampleID <- as.character(colData(object)$barcode)
    pmat_new$Study <- "study"

    tabGroupCol <-
        cbind(pmat_new, Color = matrix(0, nrow(pmat_new), 1))
    for (i in seq_along(unique(tabGroupCol$Disease))) {
        tabGroupCol[which(tabGroupCol$Disease == tabGroupCol$Disease[i]), "Color"] <-
            rainbow(length(unique(tabGroupCol$Disease)))[i]
    }

    #    pmat <- as.matrix(pData(phenoData(object)))
    pmat <- pmat_new
    phenodepth <- min(ncol(pmat), 3)
    order <- switch(
        phenodepth + 1,
        ArrayIndex,
        order(pmat[, 1]),
        order(pmat[, 1], pmat[, 2]),
        order(pmat[, 1],
              pmat[, 2], pmat[, 3])
    )
    arraypos <-
        (1:length(ArrayIndex)) * (1 / (length(ArrayIndex) - 1)) - (1 / (length(ArrayIndex) - 1))
    arraypos2 = seq(1:length(ArrayIndex) - 1)
    for (i in 2:length(ArrayIndex)) {
        arraypos2[i - 1] <- (arraypos[i] + arraypos[i - 1]) / 2
    }
    layout(matrix(c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 3, 3, 4), 4, 4, byrow = TRUE))

    c <- cor(assay(object, datatype)[, order], method = "spearman")

    image(c,
          xaxt = "n",
          yaxt = "n",
          #xlab = "Array Samples",
          #ylab = "Array Samples",
          main = "Array-Array Intensity Correlation after RMA")

    for (i in 1:length(names(table(tabGroupCol$Color)))) {
        currentCol <- names(table(tabGroupCol$Color))[i]
        pos.col <- arraypos[which(tabGroupCol$Color == currentCol)]
        lab.col <-
            colnames(c)[which(tabGroupCol$Color == currentCol)]
        #axis(1, labels = lab.col , at = pos.col, col = currentCol,lwd = 6,las = 2)
        axis(
            2,
            labels = lab.col ,
            at = pos.col,
            col = currentCol,
            lwd = 6,
            las = 2
        )
    }

    m <-
        matrix(pretty(c, 10), nrow = 1, ncol = length(pretty(c, 10)))

    image(m,
          xaxt = "n",
          yaxt = "n",
          ylab = "Correlation Coefficient")

    axis(2,
         labels = as.list(pretty(c, 10)),
         at = seq(0, 1, by = (1 / (
             length(pretty(c,  10)) - 1
         ))))

    abline(h = seq((1 / (
        length(pretty(c, 10)) - 1
    )) / 2,
    1 - (1 / (
        length(pretty(c, 10)) - 1
    )),
    by = (1 / (
        length(pretty(c, 10)) - 1
    ))))
    box()
    boxplot(
        c,
        outline = FALSE,
        las = 2,
        lwd = 6,
        # names = NULL,
        col = tabGroupCol$Color,
        main = "Boxplot of correlation samples by samples after normalization"
    )
    dev.off()

    samplesCor <- rowMeans(c)
    objectWO <-  assay(object, datatype)[, samplesCor > cor.cut]
    colnames(objectWO) <- colnames(object)[samplesCor > cor.cut]

    return(objectWO)
}

#' @title survival analysis (SA) univariate with Kaplan-Meier (KM) method.
#' @description TCGAanalyze_SurvivalKM perform an univariate Kaplan-Meier (KM) survival analysis (SA).
#' It performed Kaplan-Meier survival univariate using complete follow up with all days
#' taking one gene a time from Genelist of gene symbols.
#' For each gene according its level of mean expression in cancer samples,
#' defining two thresholds for quantile
#' expression of that gene in all samples (default ThreshTop=0.67,ThreshDown=0.33) it is possible
#' to define a threshold of intensity of gene expression to divide the samples in 3 groups
#' (High, intermediate, low).
#' TCGAanalyze_SurvivalKM performs SA between High and low groups using following functions
#' from survival package
#' \enumerate{
#' \item survival::Surv
#' \item survival::survdiff
#' \item survival::survfit
#' }
#' @param clinical_patient is a data.frame using function 'clinic' with information
#' related to barcode / samples such as bcr_patient_barcode, days_to_death ,
#' days_to_last_follow_up , vital_status, etc
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param Genelist is a list of gene symbols where perform survival KM.
#' @param Survresult is a parameter (default = FALSE) if is TRUE will show KM plot and results.
#' @param ThreshTop is a quantile threshold to identify samples with high expression of a gene
#' @param ThreshDown is a quantile threshold to identify samples with low expression of a gene
#' @param p.cut p.values threshold. Default: 0.05
#' @param group1 a string containing the barcode list of the samples in in control group
#' @param group2 a string containing the barcode list of the samples in in disease group
#' @export
#' @return table with survival genes pvalues from KM.
#' @examples
#'  # Selecting only 20 genes for example
#'  dataBRCAcomplete <- log2(dataBRCA[1:20,] + 1)
#'
#'  # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical")
#'  clinical_patient_Cancer <- data.frame(
#'       bcr_patient_barcode = substr(colnames(dataBRCAcomplete),1,12),
#'       vital_status = c(rep("alive",3),"dead",rep("alive",2),rep(c("dead","alive"),2)),
#'       days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA),
#'       days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417)
#'  )
#'
#'  group1 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("NT"))
#'  group2 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("TP"))
#'
#'  tabSurvKM <- TCGAanalyze_SurvivalKM(clinical_patient_Cancer,
#'                                      dataBRCAcomplete,
#'                                      Genelist = rownames(dataBRCAcomplete),
#'                                      Survresult = FALSE,
#'                                      p.cut = 0.4,
#'                                      ThreshTop = 0.67,
#'                                      ThreshDown = 0.33,
#'                                      group1 = group1, # Control group
#'                                      group2 = group2) # Disease group
#'
#'  # If the groups are not specified group1 == group2 and all samples are used
#'  \dontrun{
#'  tabSurvKM <- TCGAanalyze_SurvivalKM(clinical_patient_Cancer,
#'                                      dataBRCAcomplete,
#'                                      Genelist = rownames(dataBRCAcomplete),
#'                                      Survresult = TRUE,
#'                                      p.cut = 0.2,
#'                                      ThreshTop = 0.67,
#'                                      ThreshDown = 0.33)
#' }
TCGAanalyze_SurvivalKM <- function(
    clinical_patient,
    dataGE,
    Genelist,
    Survresult = FALSE,
    ThreshTop = 0.67,
    ThreshDown = 0.33,
    p.cut = 0.05,
    group1,
    group2
) {

    check_package("survival")
    # Check which genes we really have in the matrix
    Genelist <- intersect(rownames(dataGE), Genelist)

    # Split gene expression matrix btw the groups
    dataCancer <- dataGE[Genelist, group2, drop = FALSE]
    dataNormal <- dataGE[Genelist, group1, drop = FALSE]
    colnames(dataCancer)  <- substr(colnames(dataCancer), 1, 12)

    cfu <-
        clinical_patient[clinical_patient[, "bcr_patient_barcode"] %in% substr(colnames(dataCancer), 1, 12), ]
    if ("days_to_last_followup" %in% colnames(cfu))
        colnames(cfu)[grep("days_to_last_followup", colnames(cfu))] <-
        "days_to_last_follow_up"
    cfu <-
        as.data.frame(subset(
            cfu,
            select = c(
                "bcr_patient_barcode",
                "days_to_death",
                "days_to_last_follow_up",
                "vital_status"
            )
        ))

    # Set alive death to inf
    if (length(grep("alive", cfu$vital_status, ignore.case = TRUE)) > 0)
        cfu[grep("alive", cfu$vital_status, ignore.case = TRUE), "days_to_death"] <-
        "-Inf"

    # Set dead follow up to inf
    if (length(grep("dead", cfu$vital_status, ignore.case = TRUE)) > 0)
        cfu[grep("dead", cfu$vital_status, ignore.case = TRUE), "days_to_last_follow_up"] <-
        "-Inf"

    cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
    cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]

    followUpLevel <- FALSE

    #FC_FDR_table_mRNA
    tabSurv_Matrix <-
        matrix(0, nrow(as.matrix(rownames(dataNormal))), 8)
    colnames(tabSurv_Matrix) <- c(
        "mRNA",
        "pvalue",
        "Cancer Deaths",
        "Cancer Deaths with Top",
        "Cancer Deaths with Down",
        "Mean Tumor Top",
        "Mean Tumor Down",
        "Mean Normal"
    )

    tabSurv_Matrix <- as.data.frame(tabSurv_Matrix)

    cfu$days_to_death <- as.numeric(as.character(cfu$days_to_death))
    cfu$days_to_last_follow_up <-
        as.numeric(as.character(cfu$days_to_last_follow_up))
    rownames(cfu) <- cfu[, "bcr_patient_barcode"] #mod1

    cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
    cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]

    cfu_complete <- cfu
    ngenes <- nrow(as.matrix(rownames(dataNormal)))

    # Evaluate each gene
    for (i in 1:nrow(as.matrix(rownames(dataNormal))))  {
        cat(paste0((ngenes - i), "."))
        mRNAselected <- as.matrix(rownames(dataNormal))[i]
        mRNAselected_values <-
            dataCancer[rownames(dataCancer) == mRNAselected, ]
        mRNAselected_values_normal <-
            dataNormal[rownames(dataNormal) == mRNAselected, ]
        if (all(mRNAselected_values == 0))
            next # All genes are 0
        tabSurv_Matrix[i, "mRNA"] <- mRNAselected


        # Get Thresh values for cancer expression
        mRNAselected_values_ordered <-
            sort(mRNAselected_values, decreasing = TRUE)
        mRNAselected_values_ordered_top <-
            as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshTop)[1])
        mRNAselected_values_ordered_down <-
            as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshDown)[1])

        mRNAselected_values_newvector <- mRNAselected_values


        if (!is.na(mRNAselected_values_ordered_top)) {
            # How many samples do we have
            numberOfSamples <- length(mRNAselected_values_ordered)

            # High group (above ThreshTop)
            lastelementTOP <-
                max(which(
                    mRNAselected_values_ordered > mRNAselected_values_ordered_top
                ))

            # Low group (below ThreshDown)
            firstelementDOWN <-
                min(
                    which(
                        mRNAselected_values_ordered <= mRNAselected_values_ordered_down
                    )
                )

            samples_top_mRNA_selected <-
                names(mRNAselected_values_ordered[1:lastelementTOP])
            samples_down_mRNA_selected <-
                names(mRNAselected_values_ordered[firstelementDOWN:numberOfSamples])

            # Which samples are in the intermediate group (above ThreshLow and below ThreshTop)
            samples_UNCHANGED_mRNA_selected <-
                names(mRNAselected_values_newvector[which((mRNAselected_values_newvector) > mRNAselected_values_ordered_down &
                                                              mRNAselected_values_newvector < mRNAselected_values_ordered_top
                )])

            cfu_onlyTOP <-
                cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_top_mRNA_selected, ]
            cfu_onlyDOWN <-
                cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_down_mRNA_selected, ]
            cfu_onlyUNCHANGED <-
                cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_UNCHANGED_mRNA_selected, ]

            cfu_ordered <- NULL
            cfu_ordered <- rbind(cfu_onlyTOP, cfu_onlyDOWN)
            cfu <- cfu_ordered

            ttime <- as.numeric(cfu[, "days_to_death"])

            sum(status <- ttime > 0) # morti
            deads_complete <- sum(status <- ttime > 0)

            ttime_only_top <- cfu_onlyTOP[, "days_to_death"]
            deads_top <- sum(ttime_only_top > 0)

            if (dim(cfu_onlyDOWN)[1] >= 1) {
                ttime_only_down <- cfu_onlyDOWN[, "days_to_death"]
                deads_down <- sum(ttime_only_down > 0)
            } else {
                deads_down <- 0
            }

            tabSurv_Matrix[i, "Cancer Deaths"] <- deads_complete
            tabSurv_Matrix[i, "Cancer Deaths with Top"] <- deads_top
            tabSurv_Matrix[i, "Cancer Deaths with Down"] <-
                deads_down
            tabSurv_Matrix[i, "Mean Normal"] <-
                mean(as.numeric(mRNAselected_values_normal))
            dataCancer_onlyTop_sample <-
                dataCancer[, samples_top_mRNA_selected, drop = FALSE]
            dataCancer_onlyTop_sample_mRNASelected <-
                dataCancer_onlyTop_sample[rownames(dataCancer_onlyTop_sample) == mRNAselected, ]
            dataCancer_onlyDown_sample <-
                dataCancer[, samples_down_mRNA_selected, drop = FALSE]
            dataCancer_onlyDown_sample_mRNASelected <-
                dataCancer_onlyDown_sample[rownames(dataCancer_onlyDown_sample) == mRNAselected, ]
            tabSurv_Matrix[i, "Mean Tumor Top"] <-
                mean(as.numeric(dataCancer_onlyTop_sample_mRNASelected))
            tabSurv_Matrix[i, "Mean Tumor Down"] <-
                mean(as.numeric(dataCancer_onlyDown_sample_mRNASelected))

            ttime[!status] <-
                as.numeric(cfu[!status, "days_to_last_follow_up"])
            ttime[which(ttime == -Inf)] <- 0

            ttime <- survival::Surv(ttime, status)
            rownames(ttime) <- rownames(cfu)
            legendHigh <- paste(mRNAselected, "High")
            legendLow  <- paste(mRNAselected, "Low")

            tabSurv_pvalue <- tryCatch({
                tabSurv <-
                    survival::survdiff(ttime  ~ c(rep(
                        "top", nrow(cfu_onlyTOP)
                    ), rep(
                        "down", nrow(cfu_onlyDOWN)
                    )))
                tabSurv_chis <- unlist(tabSurv)$chisq
                tabSurv_pvalue <-
                    as.numeric(1 - pchisq(abs(tabSurv$chisq), df = 1))
            }, error = function(e) {
                return(Inf)
            })
            tabSurv_Matrix[i, "pvalue"] <- tabSurv_pvalue

            if (Survresult == TRUE) {
                titlePlot <-
                    paste("Kaplan-Meier Survival analysis, pvalue=",
                          tabSurv_pvalue)
                plot(
                    survival::survfit(ttime ~ c(
                        rep("low", nrow(cfu_onlyTOP)), rep("high", nrow(cfu_onlyDOWN))
                    )),
                    col = c("green", "red"),
                    main = titlePlot,
                    xlab = "Days",
                    ylab = "Survival"
                )
                legend(
                    100,
                    1,
                    legend = c(legendLow, legendHigh),
                    col = c("green", "red"),
                    text.col = c("green", "red"),
                    pch = 15
                )
                print(tabSurv)
            }
        } #end if

    } #end for

    tabSurv_Matrix[tabSurv_Matrix == "-Inf"] <- 0

    tabSurvKM <- tabSurv_Matrix

    # Filtering by selected pvalue < 0.01
    tabSurvKM <- tabSurvKM[tabSurvKM$mRNA != 0, ]
    tabSurvKM <- tabSurvKM[tabSurvKM$pvalue < p.cut, ]
    tabSurvKM <- tabSurvKM[!duplicated(tabSurvKM$mRNA), ]
    rownames(tabSurvKM) <- tabSurvKM$mRNA
    tabSurvKM <- tabSurvKM[, -1]
    tabSurvKM <-
        tabSurvKM[order(tabSurvKM$pvalue, decreasing = FALSE), ]

    colnames(tabSurvKM) <-
        gsub("Cancer", "Group2", colnames(tabSurvKM))
    colnames(tabSurvKM) <-
        gsub("Tumor", "Group2", colnames(tabSurvKM))
    colnames(tabSurvKM) <-
        gsub("Normal", "Group1", colnames(tabSurvKM))


    return(tabSurvKM)
}


#' @title Filtering mRNA transcripts and miRNA selecting a threshold.
#' @description
#'    TCGAanalyze_Filtering allows user to filter mRNA transcripts and miRNA,
#'    samples, higher than the threshold defined quantile mean across all samples.
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare
#' @param method is method of filtering such as 'quantile', 'varFilter', 'filter1', 'filter2'
#' @param qnt.cut is threshold selected as mean for filtering
#' @param var.func is function used as the per-feature filtering statistic.
#' See genefilter documentation
#' @param var.cutoff is a numeric value. See genefilter documentation
#' @param eta is a parameter for filter1. default eta = 0.05.
#' @param foldChange is a parameter for filter2. default foldChange = 1.
#' @export
#' @return A filtered dataframe or numeric matrix where each row represents a gene,
#' each column represents a sample
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA,
#' geneInfo = geneInfo,
#' method = "geneLength")
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut = 0.25)
TCGAanalyze_Filtering <- function(tabDF,
                                  method,
                                  qnt.cut = 0.25,
                                  var.func = IQR,
                                  var.cutoff = 0.75,
                                  eta = 0.05,
                                  foldChange = 1) {
    if (method == "quantile") {
        GeneThresh <- as.numeric(quantile(rowMeans(tabDF), qnt.cut))
        geneFiltered <- names(which(rowMeans(tabDF) > GeneThresh))
        tabDF_Filt <- tabDF[geneFiltered,]
    }

    if (method == "varFilter") {
        check_package("genefilter")
        tabDF_Filt <- genefilter::varFilter(
            tabDF,
            var.func = IQR,
            var.cutoff = 0.75,
            filterByQuantile = TRUE
        )
    }

    if (method == "filter1") {
        normCounts <- tabDF
        geData <- t(log(1 + normCounts, 2))
        filter <-
            apply(geData, 2, function(x)
                sum(quantile(x, probs = c(1 - eta, eta)) * c(1, -1)))
        tabDF_Filt <- geData[, which(filter > foldChange)]
    }

    if (method == "filter2") {
        geData <- tabDF
        filter <-
            apply(geData, 2, function(x)
                prod(quantile(x, probs =  c(1 - eta, eta)) - 10) < 0)
        tabDF_Filt <- geData[, which(filter)]
    }

    return(tabDF_Filt)
}

#' @title normalization mRNA transcripts and miRNA using EDASeq package.
#' @description
#'   TCGAanalyze_Normalization allows user to normalize mRNA transcripts and miRNA,
#'    using EDASeq package.
#'
#'    Normalization for RNA-Seq Numerical and graphical
#'     summaries of RNA-Seq read data. Within-lane normalization procedures
#'    to adjust for GC-content effect (or other gene-level effects) on read counts:
#'    loess robust local regression, global-scaling, and full-quantile normalization
#'    (Risso et al., 2011). Between-lane normalization procedures to adjust for
#'    distributional differences between lanes (e.g., sequencing depth):
#'    global-scaling and full-quantile normalization (Bullard et al., 2010).
#'
#'    For istance returns all mRNA or miRNA with mean across all
#'    samples, higher than the threshold defined quantile mean across all samples.
#'
#'    TCGAanalyze_Normalization performs normalization using following functions
#'    from EDASeq
#'    \enumerate{
#'    \item  EDASeq::newSeqExpressionSet
#'    \item  EDASeq::withinLaneNormalization
#'    \item  EDASeq::betweenLaneNormalization
#'    \item  EDASeq::counts
#'    }
#' @param tabDF Rnaseq numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param geneInfo Information matrix of 20531 genes about geneLength and gcContent.
#' Two objects are provided: TCGAbiolinks::geneInfoHT,TCGAbiolinks::geneInfo
#' @param method is method of normalization such as 'gcContent' or 'geneLength'
#' @export
#' @return Rnaseq matrix normalized with counts slot holds the count data as a matrix
#' of non-negative integer count values, one row for each observational unit (gene or the like),
#' and one column for each sample.
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
TCGAanalyze_Normalization <-
    function(tabDF, geneInfo, method = "geneLength") {

        check_package("EDASeq")

        # Check if we have a SE, we need a gene expression matrix
        if (is(tabDF, "SummarizedExperiment")) tabDF <- assay(tabDF)

        geneInfo <- geneInfo[!is.na(geneInfo[, 1]), ]
        geneInfo <- as.data.frame(geneInfo)
        geneInfo$geneLength <-
            as.numeric(as.character(geneInfo$geneLength))
        geneInfo$gcContent <- as.numeric(as.character(geneInfo$gcContent))


        if (method == "gcContent") {
            tmp <- as.character(rownames(tabDF))
            tmp <- strsplit(tmp, "\\|")
            geneNames <- matrix("", ncol = 2, nrow = length(tmp))
            j <- 1
            while (j <= length(tmp)) {
                geneNames[j, 1] <- tmp[[j]][1]
                geneNames[j, 2] <- tmp[[j]][2]
                j <- j + 1
            }
            tmp <- which(geneNames[, 1] == "?")
            geneNames[tmp, 1] <- geneNames[tmp, 2]
            tmp <- table(geneNames[, 1])
            tmp <- which(geneNames[, 1] %in% names(tmp[which(tmp > 1)]))
            geneNames[tmp, 1] <- paste(geneNames[tmp, 1], geneNames[tmp, 2], sep = ".")
            tmp <- table(geneNames[, 1])
            rownames(tabDF) <- geneNames[, 1]

            rawCounts <- tabDF
            commonGenes <-  intersect(rownames(geneInfo), rownames(rawCounts))
            geneInfo <- geneInfo[commonGenes, ]
            rawCounts <- rawCounts[commonGenes, ]

            timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
            message(
                messageEstimation <- paste(
                    "I Need about ",
                    timeEstimated,
                    "seconds for this Complete Normalization Upper Quantile",
                    " [Processing 80k elements /s]  "
                )
            )

            ffData  <- as.data.frame(geneInfo)
            rawCounts <- floor(rawCounts)

            message("Step 1 of 4: newSeqExpressionSet ...")
            tmp <- EDASeq::newSeqExpressionSet(as.matrix(rawCounts), featureData = ffData)

            #fData(tmp)[, "gcContent"] <- as.numeric(geneInfo[, "gcContent"])

            message("Step 2 of 4: withinLaneNormalization ...")
            tmp <- EDASeq::withinLaneNormalization(
                tmp, "gcContent",
                which = "upper",
                offset = TRUE
            )

            message("Step 3 of 4: betweenLaneNormalization ...")
            tmp <- EDASeq::betweenLaneNormalization(
                tmp,
                which = "upper",
                offset = TRUE
            )
            normCounts <-  log(rawCounts + .1) + EDASeq::offst(tmp)
            normCounts <-  floor(exp(normCounts) - .1)

            message("Step 4 of 4: .quantileNormalization ...")
            tmp <- t(.quantileNormalization(t(normCounts)))
            tabDF_norm <- floor(tmp)
        }

        if (method == "geneLength") {
            tabDF <- tabDF[!(GenesCutID(as.matrix(rownames(tabDF))) == "?"), ]
            tabDF <- tabDF[!(GenesCutID(as.matrix(rownames(tabDF))) == "SLC35E2"), ]
            rownames(tabDF) <- GenesCutID(as.matrix(rownames(tabDF)))
            tabDF <- tabDF[rownames(tabDF) != "?",]
            tabDF <- tabDF[!duplicated(rownames(tabDF)), !duplicated(colnames(tabDF))]
            tabDF <- tabDF[rownames(tabDF) %in% rownames(geneInfo), ]
            tabDF <- as.matrix(tabDF)

            geneInfo <-
                geneInfo[rownames(geneInfo) %in% rownames(tabDF),]
            geneInfo <- geneInfo[!duplicated(rownames(geneInfo)),]
            toKeep <- which(geneInfo[, "geneLength"] != 0)
            geneInfo <- geneInfo[toKeep,]
            tabDF <- tabDF[toKeep,]
            geneInfo <- as.data.frame(geneInfo)
            tabDF <- round(tabDF)
            commonGenes <- intersect(rownames(tabDF), rownames(geneInfo))

            tabDF <- tabDF[commonGenes, ]
            geneInfo <- geneInfo[commonGenes, ]

            timeEstimated <-
                format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
            message(
                messageEstimation <- paste(
                    "I Need about ",
                    timeEstimated,
                    "seconds for this Complete Normalization Upper Quantile",
                    " [Processing 80k elements /s]  "
                )
            )

            message("Step 1 of 4: newSeqExpressionSet ...")
            system.time(tabDF_norm <-
                            EDASeq::newSeqExpressionSet(tabDF, featureData = geneInfo))
            message("Step 2 of 4: withinLaneNormalization ...")
            system.time(
                tabDF_norm <-
                    EDASeq::withinLaneNormalization(
                        tabDF_norm,
                        "geneLength",
                        which = "upper",
                        offset = FALSE
                    )
            )
            message("Step 3 of 4: betweenLaneNormalization ...")
            system.time(
                tabDF_norm <-
                    EDASeq::betweenLaneNormalization(tabDF_norm, which = "upper", offset = FALSE)
            )
            message("Step 4 of 4: exprs ...")

            #system.time(tabDF_norm <- EDASeq::exprs(tabDF_norm))
            system.time(tabDF_norm <- EDASeq::counts(tabDF_norm))
        }


        return(tabDF_norm)
    }


#' @title Differential expression analysis (DEA) using edgeR or limma package.
#' @description
#'    TCGAanalyze_DEA allows user to perform Differentially expression analysis (DEA),
#'    using edgeR package or limma to identify differentially expressed genes (DEGs).
#'     It is possible to do a two-class analysis.
#'
#'     TCGAanalyze_DEA performs DEA using following functions from edgeR:
#'     \enumerate{
#'     \item edgeR::DGEList converts the count matrix into an edgeR object.
#'     \item edgeR::estimateCommonDisp each gene gets assigned the same dispersion estimate.
#'     \item edgeR::exactTest performs pair-wise tests for differential expression between two groups.
#'     \item edgeR::topTags takes the output from exactTest(), adjusts the raw p-values using the
#'     False Discovery Rate (FDR) correction, and returns the top differentially expressed genes.
#'     }
#'     TCGAanalyze_DEA performs DEA using following functions from limma:
#'     \enumerate{
#'     \item limma::makeContrasts construct matrix of custom contrasts.
#'     \item limma::lmFit Fit linear model for each gene given a series of arrays.
#'     \item limma::contrasts.fit Given a linear model fit to microarray data, compute estimated coefficients and standard errors for a given set of contrasts.
#'     \item limma::eBayes Given a microarray linear model fit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a common value.
#'     \item limma::toptable Extract a table of the top-ranked genes from a linear model fit.
#'     }
#' @param mat1 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond1type
#' @param mat2 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond2type
#' @param metadata Add metadata
#' @param Cond1type a string containing the class label of the samples in mat1
#'  (e.g., control group)
#' @param Cond2type a string containing the class label of the samples in mat2
#' (e.g., case group)
#' @param pipeline a string to specify which package to use ("limma" or "edgeR")
#' @param method is 'glmLRT' (1) or 'exactTest' (2) used for edgeR
#' (1) Fit a negative binomial generalized log-linear model to
#' the read counts for each gene
#' (2) Compute genewise exact tests for differences in the means between
#' two groups of negative-binomially distributed counts.
#' @param fdr.cut is a threshold to filter DEGs according their p-value corrected
#' @param logFC.cut is a threshold to filter DEGs according their logFC
#' @param elementsRatio is number of elements processed for second for time consumation estimation
#' @param batch.factors a vector containing strings to specify options for batch correction. Options are "Plate", "TSS", "Year", "Portion", "Center", and "Patients"
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param paired boolean to account for paired or non-paired samples. Set to TRUE for paired case
#' @param log.trans boolean to perform log cpm transformation. Set to TRUE for log transformation
#' @param trend boolean to perform limma-trend pipeline. Set to TRUE to go through limma-trend
#' @param MAT matrix containing expression set as all samples in columns and genes as rows. Do not provide if mat1 and mat2 are used
#' @param contrast.formula string input to determine coefficients and to design contrasts in a customized way
#' @param Condtypes vector of grouping for samples in MAT
#' @param voom boolean to perform voom transformation for limma-voom pipeline. Set to TRUE for voom transformation
#' @export
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut =  0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT],
#'                             mat2 = dataFilt[,samplesTP],
#'                             Cond1type = "Normal",
#'                             Cond2type = "Tumor")
#'
#' @return table with DEGs containing for each gene logFC, logCPM, pValue,and FDR, also for each contrast
TCGAanalyze_DEA <-
    function (mat1,
              mat2,
              metadata = TRUE,
              Cond1type,
              Cond2type,
              pipeline = "edgeR",
              method = "exactTest",
              fdr.cut = 1,
              logFC.cut = 0,
              elementsRatio = 30000,
              batch.factors = NULL,
              ClinicalDF = data.frame(),
              paired = FALSE,
              log.trans = FALSE,
              voom = FALSE,
              trend = FALSE,
              MAT = data.frame(),
              contrast.formula = "",
              Condtypes = c())
    {
        if (pipeline == "limma") {
            if (!requireNamespace("limma", quietly = TRUE)) {
                stop("limma is needed. Please install it.",
                     call. = FALSE)
            }
        }

        if (!requireNamespace("edgeR", quietly = TRUE)) {
            stop("edgeR is needed. Please install it.",
                 call. = FALSE)
        }

        table.code <- c(
            "TP",
            "TR",
            "TB",
            "TRBM",
            "TAP",
            "TM",
            "TAM",
            "THOC",
            "TBM",
            "NB",
            "NT",
            "NBC",
            "NEBV",
            "NBM",
            "CELLC",
            "TRB",
            "CELL",
            "XP",
            "XCL"
        )
        names(table.code) <- c(
            "01",
            "02",
            "03",
            "04",
            "05",
            "06",
            "07",
            "08",
            "09",
            "10",
            "11",
            "12",
            "13",
            "14",
            "20",
            "40",
            "50",
            "60",
            "61"
        )
        if (nrow(MAT) == 0) {
            TOC <- cbind(mat1, mat2)
            Cond1num <- ncol(mat1)
            Cond2num <- ncol(mat2)
        }
        else {
            TOC <- MAT
        }
        if (metadata == TRUE) {
            my_IDs <- get_IDs(TOC)
            Plate <- factor(my_IDs$plate)
            Condition <- factor(my_IDs$condition)
            TSS <- factor(my_IDs$tss)
            Portion <- factor(my_IDs$portion)
            Center <- factor(my_IDs$center)
            Patients <- factor(my_IDs$patient)
        }
        if (paired == TRUE) {
            matched.query <- TCGAquery_MatchedCoupledSampleTypes(my_IDs$barcode,
                                                                 table.code[unique(my_IDs$sample)])
            my_IDs <- subset(my_IDs, barcode == matched.query)
            TOC <- TOC[, (names(TOC) %in% matched.query)]
        }
        if (nrow(ClinicalDF) > 0) {
            names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
                "patient"
            ClinicalDF$age_at_diag_year <-
                floor(clinical$age_at_diagnosis / 365)
            ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year +
                clinical$year_of_birth
            diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
            my_IDs <- merge(my_IDs, ClinicalDF, by = "patient")
            Year <- as.factor(my_IDs$diag_year)
        }
        options <- c("Plate", "TSS", "Year", "Portion", "Center",
                     "Patients")
        if (length(batch.factors) == 0) {
            message("Batch correction skipped since no factors provided")
        }
        else
            for (o in batch.factors) {
                if (o %in% options == FALSE)
                    stop(paste0(o, " is not a valid batch correction factor"))
                if (o == "Year" & nrow(ClinicalDF) == 0)
                    stop(
                        "batch correction using diagnosis year needs clinical info. Provide Clinical Data in arguments"
                    )
            }
        additiveformula <- paste(batch.factors, collapse = "+")
        message("----------------------- DEA -------------------------------")
        if (nrow(MAT) == 0) {
            message(message1 <- paste(
                "there are Cond1 type",
                Cond1type,
                "in ",
                Cond1num,
                "samples"
            ))
            message(message2 <- paste(
                "there are Cond2 type",
                Cond2type,
                "in ",
                Cond2num,
                "samples"
            ))
            message(message3 <-
                        paste("there are ", nrow(TOC), "features as miRNA or genes "))
        }
        else {
            message(message3 <-
                        paste("there are ", nrow(TOC), "features as miRNA or genes "))
        }
        timeEstimated <- format(ncol(TOC) * nrow(TOC) / elementsRatio,
                                digits = 2)
        message(
            messageEstimation <- paste(
                "I Need about ",
                timeEstimated,
                "seconds for this DEA. [Processing 30k elements /s]  "
            )
        )
        colnames(TOC) <- paste0("s", 1:ncol(TOC))
        if (length(Condtypes) > 0) {
            tumorType <- factor(x = Condtypes, levels = unique(Condtypes))
        }
        else {
            tumorType <- factor(x = rep(c(Cond1type, Cond2type),
                                        c(Cond1num, Cond2num)),
                                levels = c(Cond1type, Cond2type))
        }
        if (length(batch.factors) == 0 & length(Condtypes) > 0) {
            if (pipeline == "edgeR")
                design <- model.matrix( ~ tumorType)
            else
                design <- model.matrix( ~ 0 + tumorType)
        } else if (length(batch.factors) == 0 &
                   length(Condtypes) == 0) {
            if (pipeline == "edgeR")
                design <- model.matrix( ~ tumorType)
            else
                design <- model.matrix( ~ 0 + tumorType)
        } else if (length(batch.factors) > 0 & length(Condtypes) == 0) {
            if (pipeline == "edgeR")
                formula <- paste0("~tumorType+", additiveformula)
            else
                formula <- paste0("~0+tumorType+", additiveformula)
            design <- model.matrix(eval(parse(text = formula)))
        } else if (length(batch.factors) > 0 & length(Condtypes) > 0) {
            if (pipeline == "edgeR") {
                formula <- paste0("~tumorType+", additiveformula)
                if (length(Condtypes) > 2)
                    formula <- paste0("~0+tumorType+", additiveformula)
            }
            else
                formula <- paste0("~0+tumorType+", additiveformula)
            design <- model.matrix(eval(parse(text = formula)))
        }

        if (pipeline == "edgeR") {
            if (method == "exactTest") {
                DGE <- edgeR::DGEList(TOC, group = rep(c(Cond1type,
                                                         Cond2type), c(Cond1num, Cond2num)))
                disp <- edgeR::estimateCommonDisp(DGE)
                tested <- edgeR::exactTest(disp, pair = c(Cond1type,
                                                          Cond2type))
                logFC_table <- tested$table
                tableDEA <-
                    edgeR::topTags(tested, n = nrow(tested$table))$table
                tableDEA <- tableDEA[tableDEA$FDR <= fdr.cut,]
                tableDEA <- tableDEA[abs(tableDEA$logFC) >= logFC.cut,]
            }
            else if (method == "glmLRT") {
                if (length(unique(tumorType)) == 2) {
                    aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
                    aDGEList <-
                        edgeR::estimateGLMCommonDisp(aDGEList, design)
                    aDGEList <-
                        edgeR::estimateGLMTagwiseDisp(aDGEList, design)
                    aGlmFit <-
                        edgeR::glmFit(
                            aDGEList,
                            design,
                            dispersion = aDGEList$tagwise.dispersion,
                            prior.count.total = 0
                        )


                    aGlmLRT <- edgeR::glmLRT(aGlmFit, coef = 2)
                    tableDEA <-
                        cbind(aGlmLRT$table,
                              FDR = p.adjust(aGlmLRT$table$PValue, "fdr"))
                    tableDEA <- tableDEA[tableDEA$FDR < fdr.cut, ]
                    tableDEA <-
                        tableDEA[abs(tableDEA$logFC) > logFC.cut, ]
                    if (all(grepl("ENSG", rownames(tableDEA))))
                        tableDEA <-
                        cbind(tableDEA, map.ensg(genes = rownames(tableDEA))[, 2:3])
                }
                else if (length(unique(tumorType)) > 2) {
                    aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
                    colnames(design)[1:length(levels(tumorType))] <-
                        levels(tumorType)
                    prestr = "makeContrasts("
                    poststr = ",levels=design)"
                    commandstr = paste(prestr, contrast.formula,
                                       poststr, sep = "")
                    commandstr = paste0("limma::", commandstr)
                    cont.matrix <- eval(parse(text = commandstr))
                    aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList,
                                                             design)
                    aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList,
                                                              design)

                    aGlmFit <-
                        edgeR::glmFit(
                            aDGEList,
                            design,
                            dispersion = aDGEList$tagwise.dispersion,
                            prior.count.total = 0
                        )

                    tableDEA <- list()
                    for (mycoef in colnames(cont.matrix)) {
                        message(paste0("DEA for", " :", mycoef))
                        aGlmLRT <-
                            edgeR::glmLRT(aGlmFit, contrast = cont.matrix[, mycoef])
                        #print("---toptags---")
                        #print(topTags(aGlmLRT, adjust.method = "fdr",
                        #  sort.by = "PValue"))
                        tt <- aGlmLRT$table
                        tt <-
                            cbind(tt, FDR = p.adjust(aGlmLRT$table$PValue,
                                                     "fdr"))
                        tt <-
                            tt[(tt$FDR < fdr.cut & abs(as.numeric(tt$logFC)) >
                                    logFC.cut),]
                        tableDEA[[as.character(mycoef)]] <- tt
                        if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
                            tableDEA[[as.character(mycoef)]] <-
                            cbind(tableDEA[[as.character(mycoef)]],
                                  map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,
                                                                                               2:3])
                    }
                }
            }
            else
                stop(
                    paste0(
                        method,
                        " is not a valid DEA method option. Choose 'exactTest' or 'glmLRT' "
                    )
                )
        }
        else if (pipeline == "limma") {
            if (log.trans == TRUE)
                logCPM <- edgeR::cpm(TOC, log = TRUE, prior.count = 3)
            else
                logCPM <- TOC
            if (voom == TRUE) {
                message("Voom Transformation...")
                logCPM <- limma::voom(logCPM, design)
            }
            if (length(unique(tumorType)) == 2) {
                colnames(design)[1:2] <- c(Cond1type, Cond2type)
                contr <- paste0(Cond2type, "-", Cond1type)
                cont.matrix <- limma::makeContrasts(contrasts = contr,
                                                    levels = design)
                fit <- limma::lmFit(logCPM, design)
                fit <- limma::contrasts.fit(fit, cont.matrix)
                if (trend == TRUE) {
                    fit <- limma::eBayes(fit, trend = TRUE)
                }
                else {
                    fit <- limma::eBayes(fit, trend = FALSE)
                }
                tableDEA <-
                    limma::topTable(
                        fit,
                        coef = 1,
                        adjust.method = "fdr",
                        number = nrow(TOC)
                    )
                limma::volcanoplot(fit, highlight = 10)
                index <- which(tableDEA[, 4] < fdr.cut)
                tableDEA <- tableDEA[index,]
                neg_logFC.cut <- -1 * logFC.cut
                index <- which(abs(as.numeric(tableDEA$logFC)) >
                                   logFC.cut)
                tableDEA <- tableDEA[index,]
            }
            else if (length(unique(tumorType)) > 2) {
                DGE <- edgeR::DGEList(TOC, group = tumorType)
                colnames(design)[1:length(levels(tumorType))] <-
                    levels(tumorType)
                prestr = "makeContrasts("
                poststr = ",levels=colnames(design))"
                commandstr = paste(prestr, contrast.formula, poststr,
                                   sep = "")
                commandstr = paste0("limma::", commandstr)
                cont.matrix <- eval(parse(text = commandstr))
                fit <- limma::lmFit(logCPM, design)
                fit <- limma::contrasts.fit(fit, cont.matrix)
                if (trend == TRUE)
                    fit <- limma::eBayes(fit, trend = TRUE)
                else
                    fit <- limma::eBayes(fit, trend = FALSE)
                tableDEA <- list()
                for (mycoef in colnames(cont.matrix)) {
                    tableDEA[[as.character(mycoef)]] <- limma::topTable(
                        fit,
                        coef = mycoef,
                        adjust.method = "fdr",
                        number = nrow(MAT)
                    )
                    message(paste0("DEA for", " :", mycoef))
                    tempDEA <- tableDEA[[as.character(mycoef)]]
                    index.up <- which(tempDEA$adj.P.Val < fdr.cut &
                                          abs(as.numeric(tempDEA$logFC)) > logFC.cut)
                    tableDEA[[as.character(mycoef)]] <-
                        tempDEA[index.up,]
                    if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
                        tableDEA[[as.character(mycoef)]] <-
                        cbind(tableDEA[[as.character(mycoef)]],
                              map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,
                                                                                           2:3])
                }
            }
        }
        else
            stop(paste0(
                pipeline,
                " is not a valid pipeline option. Choose 'edgeR' or 'limma'"
            ))
        message("----------------------- END DEA -------------------------------")
        return(tableDEA)
    }


#' @title Batch correction using ComBat and Voom transformation using limma package.
#' @description
#'    TCGAbatch_correction allows user to perform a Voom correction on gene expression data and have it ready for DEA.
#'    One can also use ComBat for batch correction for exploratory analysis. If batch.factor or adjustment argument is "Year"
#'  please provide clinical data. If no batch factor is provided, the data will be voom corrected only
#'
#'     TCGAanalyze_DEA performs DEA using following functions from sva and limma:
#'     \enumerate{
#'     \item limma::voom Transform RNA-Seq Data Ready for Linear Modelling.
#'     \item sva::ComBat Adjust for batch effects using an empirical Bayes framework.
#'     }
#' @param tabDF numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param batch.factor a string containing the batch factor to use for correction. Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param adjustment vector containing strings for factors to adjust for using ComBat. Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param UnpublishedData if TRUE perform a batch correction after adding new data
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param AnnotationDF a dataframe with column Batch indicating different batches of the samples in the tabDF
#' @export
#' @return data frame with ComBat batch correction applied
TCGAbatch_Correction <- function (tabDF,
                                  batch.factor = NULL,
                                  adjustment = NULL,
                                  ClinicalDF = data.frame(),
                                  UnpublishedData = FALSE,
                                  AnnotationDF = data.frame())

{
    if (!requireNamespace("sva", quietly = TRUE)) {
        stop("sva is needed. Please install it.",
             call. = FALSE)
    }
    if (UnpublishedData == TRUE) {
        batch.factor <- as.factor(AnnotationDF$Batch)
        batch_corr <-
            sva::ComBat(
                dat = tabDF,
                batch = batch.factor,
                par.prior = TRUE,
                prior.plots = TRUE
            )
    }

    if (UnpublishedData == FALSE) {
        if (length(batch.factor) == 0 & length(adjustment) == 0)
            message("batch correction will be skipped")
        else if (batch.factor %in% adjustment) {
            stop(paste0("Cannot adjust and correct for the same factor|"))
        }
        my_IDs <- get_IDs(tabDF)
        if (length(batch.factor) > 0 || length(adjustment) > 0)
            if ((nrow(ClinicalDF) > 0 & batch.factor == "Year") ||
                ("Year" %in% adjustment == TRUE & nrow(ClinicalDF) >
                 0)) {
                names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
                    "patient"
                ClinicalDF$age_at_diag_year <-
                    floor(ClinicalDF$age_at_diagnosis / 365)
                ClinicalDF$diag_year <-
                    ClinicalDF$age_at_diag_year +
                    ClinicalDF$year_of_birth
                diag_yearDF <-
                    ClinicalDF[, c("patient", "diag_year")]
                Year <- merge(my_IDs, diag_yearDF, by = "patient")
                Year <- Year$diag_year
                Year <- as.factor(Year)
            }
        else if (nrow(ClinicalDF) == 0 & batch.factor == "Year") {
            stop("Cannot extract Year data. Clinical data was not provided")
        }
        Plate <- as.factor(my_IDs$plate)
        Condition <- as.factor(my_IDs$condition)
        TSS <- as.factor(my_IDs$tss)
        Portion <- as.factor(my_IDs$portion)
        Sequencing.Center <- as.factor(my_IDs$center)
        design.matrix <- model.matrix( ~ Condition)
        design.mod.combat <- model.matrix( ~ Condition)
        options <-
            c("Plate", "TSS", "Year", "Portion", "Sequencing Center")

        if (length(batch.factor) > 1)
            stop("Combat can only correct for one batch variable. Provide one batch factor")
        if (batch.factor %in% options == FALSE)
            stop(paste0(o, " is not a valid batch correction factor"))

        for (o in adjustment) {
            if (o %in% options == FALSE)
                stop(paste0(o, " is not a valid adjustment factor"))

        }
        adjustment.data <- c()
        for (a in adjustment) {
            if (a == "Sequencing Center")
                a <- Sequencing.Center
            adjustment.data <-
                cbind(eval(parse(text = a)), adjustment.data)
        }
        if (batch.factor == "Sequencing Center")
            batch.factor <- Sequencing.Center
        batchCombat <- eval(parse(text = batch.factor))
        if (length(adjustment) > 0) {
            adjustment.formula <- paste(adjustment, collapse = "+")
            adjustment.formula <- paste0("+", adjustment.formula)
            adjustment.formula <-
                paste0("~Condition", adjustment.formula)
            print(adjustment.formula)
            model <-
                data.frame(batchCombat, row.names = colnames(tabDF))
            design.mod.combat <-
                model.matrix(eval(parse(text = adjustment.formula)),
                             data = model)
        }
        print(unique(batchCombat))
        batch_corr <- sva::ComBat(
            dat = tabDF,
            batch = batchCombat,
            mod = design.mod.combat,
            par.prior = TRUE,
            prior.plots = TRUE
        )
    }

    return(batch_corr)
}

##Function to take raw counts by removing rows filtered after norm and filter process###

#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
#' @description function to keep raw counts after filtering and/or normalizing.
#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
#' @param DataFilt Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
#' @examples
#' \dontrun{
#'   dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)
#' }
#' @export
#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
UseRaw_afterFilter <- function(DataPrep, DataFilt) {
    rownames(DataPrep) <-
        lapply(rownames(DataPrep), function(x)
            gsub("[[:punct:]]\\d*", "", x))
    filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
    Res <- DataPrep[!rownames(DataPrep) %in% filtered.list,]
    return(Res)
}

#' @title Adding information related to DEGs genes from DEA as mean values in two conditions.
#' @description
#'    TCGAanalyze_LevelTab allows user to add information related to DEGs genes from
#'    Differentially expression analysis (DEA) such as mean values and in two conditions.
#' @param FC_FDR_table_mRNA Output of dataDEGs filter by abs(LogFC) >=1
#' @param typeCond1 a string containing the class label of the samples
#'  in TableCond1  (e.g., control group)
#' @param typeCond2 a string containing the class label of the samples
#' in TableCond2  (e.g., case group)
#' @param TableCond1 numeric matrix, each row represents a gene, each column
#'  represents a sample with Cond1type
#' @param TableCond2 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond2type
#' @param typeOrder typeOrder
#' @export
#' @return table with DEGs, log Fold Change (FC), false discovery rate (FDR),
#' the gene expression level
#' for samples in  Cond1type, and Cond2type, and Delta value (the difference
#' of gene expression between the two
#' conditions multiplied logFC)
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut =  0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(dataFilt[,samplesNT],
#'                             dataFilt[,samplesTP],
#'                             Cond1type = "Normal",
#'                             Cond2type = "Tumor")
#' dataDEGsFilt <- dataDEGs[abs(dataDEGs$logFC) >= 1,]
#' dataTP <- dataFilt[,samplesTP]
#' dataTN <- dataFilt[,samplesNT]
#' dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGsFilt,"Tumor","Normal",
#' dataTP,dataTN)
TCGAanalyze_LevelTab <- function(FC_FDR_table_mRNA,
                                 typeCond1,
                                 typeCond2,
                                 TableCond1,
                                 TableCond2,
                                 typeOrder = TRUE) {
    TF_enriched <- as.matrix(rownames(FC_FDR_table_mRNA))
    TableLevel <- matrix(0, nrow(TF_enriched), 6)
    TableLevel <- as.data.frame(TableLevel)

    colnames(TableLevel) <-
        c("mRNA", "logFC", "FDR", typeCond1, typeCond2, "Delta")


    TableLevel[, "mRNA"] <- TF_enriched
    Tabfilt <-
        FC_FDR_table_mRNA[which(rownames(FC_FDR_table_mRNA) %in%
                                    TF_enriched), ]
    TableLevel[, "logFC"] <-
        as.numeric(Tabfilt[TF_enriched, ][, "logFC"])
    TableLevel[, "FDR"] <- as.numeric(Tabfilt[TF_enriched, ][, "FDR"])


    MeanTumor <- matrix(0, nrow(TF_enriched), 1)
    MeanDiffTumorNormal <- matrix(0, nrow(TF_enriched), 1)

    for (i in 1:nrow(TF_enriched)) {
        TableLevel[i, typeCond1] <-
            mean(as.numeric(TableCond1[rownames(TableCond1) %in%
                                           TF_enriched[i] ,]))
        TableLevel[i, typeCond2] <-
            mean(as.numeric(TableCond2[rownames(TableCond2) %in%
                                           TF_enriched[i] ,]))
    }


    TableLevel[, "Delta"] <- as.numeric(abs(TableLevel[, "logFC"]) *
                                            TableLevel[, typeCond1])

    TableLevel <-
        TableLevel[order(as.numeric(TableLevel[, "Delta"]),
                         decreasing = typeOrder), ]

    rownames(TableLevel) <-  TableLevel[, "mRNA"]
    if (all(grepl("ENSG", rownames(TableLevel))))
        TableLevel <-
        cbind(TableLevel, map.ensg(genes = rownames(TableLevel))[, 2:3])
    return(TableLevel)
}

#' @title Enrichment analysis for Gene Ontology (GO) [BP,MF,CC] and Pathways
#' @description
#'   Researchers, in order to better understand the underlying biological
#'   processes, often want to retrieve a functional profile of a set of genes
#'   that might have an important role. This can be done by performing an
#'   enrichment analysis.
#'
#'We will perform an enrichment analysis on gene sets using the TCGAanalyze_EAcomplete
#'function. Given a set of genes that are
#'up-regulated under certain conditions, an enrichment analysis will find
#'identify classes of genes or proteins that are #'over-represented using
#'annotations for that gene set.
#' @param TFname is the name of the list of genes or TF's regulon.
#' @param RegulonList List of genes such as TF's regulon or DEGs where to find enrichment.
#' @export
#' @return Enrichment analysis GO[BP,MF,CC] and Pathways complete table enriched by genelist.
#' @examples
#' Genelist <- c("FN1","COL1A1")
#' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
#' \dontrun{
#' Genelist <- rownames(dataDEGsFiltLevel)
#' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist))
#' }
TCGAanalyze_EAcomplete <- function(TFname, RegulonList) {
    # This is a verification of the input
    # in case the List is like Gene|ID
    # we will get only the Gene
    if (all(grepl("\\|", RegulonList))) {
        RegulonList <- strsplit(RegulonList, "\\|")
        RegulonList <- unlist(lapply(RegulonList, function(x)
            x[1]))
    }

    print(
        paste(
            "I need about ",
            "1 minute to finish complete ",
            "Enrichment analysis GO[BP,MF,CC] and Pathways... "
        )
    )

    ResBP <- TCGAanalyze_EA(TFname, RegulonList, DAVID_BP_matrix,
                            EAGenes, GOtype = "DavidBP")
    print("GO Enrichment Analysis BP completed....done")
    ResMF <- TCGAanalyze_EA(TFname, RegulonList, DAVID_MF_matrix,
                            EAGenes, GOtype = "DavidMF")
    print("GO Enrichment Analysis MF completed....done")
    ResCC <- TCGAanalyze_EA(TFname, RegulonList, DAVID_CC_matrix,
                            EAGenes, GOtype = "DavidCC")
    print("GO Enrichment Analysis CC completed....done")
    ResPat <- TCGAanalyze_EA(TFname, RegulonList, listEA_pathways,
                             EAGenes, GOtype = "Pathway")
    print("Pathway Enrichment Analysis completed....done")

    ans <-
        list(
            ResBP = ResBP,
            ResMF = ResMF,
            ResCC = ResCC,
            ResPat = ResPat
        )
    return(ans)
}

#' @title Enrichment analysis of a gene-set with GO [BP,MF,CC]  and pathways.
#' @description
#' The rational behind a enrichment analysis ( gene-set, pathway etc) is to compute
#' statistics of whether the overlap between the focus list (signature) and the gene-set
#' is significant. ie the confidence that overlap between the list is not due to chance.
#'  The Gene Ontology project describes genes (gene products) using terms from
#'  three structured vocabularies: biological process, cellular component and molecular function.
#'  The Gene Ontology Enrichment component, also referred to as the GO Terms" component, allows
#'  the genes in any such "changed-gene" list to be characterized using the Gene Ontology terms
#'  annotated to them. It asks, whether for any particular GO term, the fraction of genes
#'  assigned to it in the "changed-gene" list is higher than expected by chance
#'  (is over-represented), relative to the fraction of genes assigned to that term in the
#'  reference set.
#'  In statistical terms it perform the analysis tests the null hypothesis that,
#'  for any particular ontology term, there is no difference in the proportion of genes
#'  annotated to it in the reference list and the proportion annotated to it in the test list.
#'  We adopted a Fisher Exact Test to perform the EA.
#' @param GeneName is the name of gene signatures list
#' @param TableEnrichment is a table related to annotations of gene symbols such as
#' GO[BP,MF,CC] and Pathways. It was created from DAVID gene ontology on-line.
#' @param RegulonList is a gene signature (lisf of genes) in which perform EA.
#' @param GOtype is type of gene ontology Biological process (BP), Molecular Function (MF),
#' Cellular componet (CC)
#' @param FDRThresh pvalue corrected (FDR) as threshold to selected significant
#' BP, MF,CC, or pathways. (default FDR < 0.01)
#' @param EAGenes is a table with informations about genes
#' such as ID, Gene, Description, Location and Family.
#' @param GeneSymbolsTable if it is TRUE will return a table with GeneSymbols in common GO or pathways.
# @export
#' @import stats
#' @return Table with enriched GO or pathways by selected gene signature.
#' @examples
#' \dontrun{
#' EAGenes <- get("EAGenes")
#' RegulonList <- rownames(dataDEGsFiltLevel)
#' ResBP <- TCGAanalyze_EA(GeneName="DEA genes Normal Vs Tumor",
#'                            RegulonList,DAVID_BP_matrix,
#'                            EAGenes,GOtype = "DavidBP")
#'}
TCGAanalyze_EA <-
    function (GeneName,
              RegulonList,
              TableEnrichment,
              EAGenes,
              GOtype,
              FDRThresh = 0.01,
              GeneSymbolsTable = FALSE)
    {
        topPathways <- nrow(TableEnrichment)
        topPathways_tab <- matrix(0, 1, topPathways)
        topPathways_tab <- as.matrix(topPathways_tab)
        rownames(topPathways_tab) <- GeneName
        rownames(EAGenes) <- toupper(rownames(EAGenes))
        EAGenes <- EAGenes[!duplicated(EAGenes[, "ID"]),]
        rownames(EAGenes) <- EAGenes[, "ID"]
        allgene <- EAGenes[, "ID"]
        current_pathway_from_EA <- as.matrix(TableEnrichment[, GOtype])
        TableNames <- gsub("David",
                           "",
                           paste("Top ", GOtype, " n. ",
                                 1:topPathways, " of ", topPathways, sep = ""))
        colnames(topPathways_tab) <- TableNames
        topPathways_tab <- as.data.frame(topPathways_tab)
        table_pathway_enriched <-
            matrix(1, nrow(current_pathway_from_EA),
                   8)
        colnames(table_pathway_enriched) <-
            c(
                "Pathway",
                "GenesInPathway",
                "Pvalue",
                "FDR",
                "CommonGenesPathway",
                "PercentPathway",
                "PercentRegulon",
                "CommonGeneSymbols"
            )
        table_pathway_enriched <- as.data.frame(table_pathway_enriched)
        for (i in 1:nrow(current_pathway_from_EA)) {
            table_pathway_enriched[i, "Pathway"] <-
                as.character(current_pathway_from_EA[i,])
            if (nrow(TableEnrichment) == 589) {
                genes_from_current_pathway_from_EA <-
                    GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
                                                         as.character(current_pathway_from_EA[i,]),][,
                                                                                                     "Molecules"], ",")
            }
            else {
                genes_from_current_pathway_from_EA <-
                    GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
                                                         as.character(current_pathway_from_EA[i,]),][,
                                                                                                     "Molecules"], ", ")
            }
            genes_common_pathway_TFregulon <-
                as.matrix(intersect(
                    toupper(RegulonList),
                    toupper(genes_from_current_pathway_from_EA)
                ))
            if (length(genes_common_pathway_TFregulon) != 0) {
                current_pathway_commongenes_num <-
                    length(genes_common_pathway_TFregulon)
                seta <- allgene %in% RegulonList
                setb <- allgene %in% genes_from_current_pathway_from_EA
                ft <- fisher.test(seta, setb)
                FisherpvalueTF <- ft$p.value
                table_pathway_enriched[i, "Pvalue"] <-
                    as.numeric(FisherpvalueTF)
                if (FisherpvalueTF < 0.01) {
                    current_pathway_commongenes_percent <- paste("(",
                                                                 format((
                                                                     current_pathway_commongenes_num / length(genes_from_current_pathway_from_EA)
                                                                 ) *
                                                                     100,
                                                                 digits = 2
                                                                 ), "%)")
                    current_pathway_commongenes_num_with_percent <-
                        gsub(
                            " ",
                            "",
                            paste(
                                current_pathway_commongenes_num,
                                current_pathway_commongenes_percent,
                                "pv=",
                                format(FisherpvalueTF, digits = 2)
                            )
                        )
                    table_pathway_enriched[i, "CommonGenesPathway"] <-
                        length(genes_common_pathway_TFregulon)
                    table_pathway_enriched[i, "CommonGeneSymbols"] <-
                        paste0(genes_common_pathway_TFregulon, collapse = ";")
                    table_pathway_enriched[i, "GenesInPathway"] <-
                        length(genes_from_current_pathway_from_EA)
                    table_pathway_enriched[i, "PercentPathway"] <-
                        as.numeric(table_pathway_enriched[i,
                                                          "CommonGenesPathway"]) /
                        as.numeric(table_pathway_enriched[i,
                                                          "GenesInPathway"]) * 100
                    table_pathway_enriched[i, "PercentRegulon"] <-
                        as.numeric(table_pathway_enriched[i,
                                                          "CommonGenesPathway"]) /
                        length(RegulonList) *
                        100
                }
            }
        }
        table_pathway_enriched <-
            table_pathway_enriched[order(table_pathway_enriched[,
                                                                "Pvalue"], decreasing = FALSE),]
        table_pathway_enriched <-
            table_pathway_enriched[table_pathway_enriched[,
                                                          "Pvalue"] < 0.01,]
        table_pathway_enriched[, "FDR"] <-
            p.adjust(table_pathway_enriched[,
                                            "Pvalue"], method = "fdr")
        table_pathway_enriched <-
            table_pathway_enriched[table_pathway_enriched[,
                                                          "FDR"] < FDRThresh,]
        table_pathway_enriched <-
            table_pathway_enriched[order(table_pathway_enriched[,
                                                                "FDR"], decreasing = FALSE),]

        table_pathway_enriched_filt <-
            table_pathway_enriched[table_pathway_enriched$FDR < FDRThresh, ]

        if (nrow(table_pathway_enriched) > 0) {
            tmp <- table_pathway_enriched
            tmp <- paste(
                tmp[, "Pathway"],
                "; FDR= ",
                format(tmp[,
                           "FDR"], digits = 3),
                "; (ng=",
                round(tmp[, "GenesInPathway"]),
                "); (ncommon=",
                format(tmp[, "CommonGenesPathway"],
                       digits = 2),
                ")",
                sep = ""
            )
            tmp <- as.matrix(tmp)
            topPathways_tab <-
                topPathways_tab[, 1:nrow(table_pathway_enriched),
                                drop = FALSE]
            topPathways_tab[1,] <- tmp
        }
        else {
            topPathways_tab <- NA
        }

        if (GeneSymbolsTable == FALSE) {
            return(topPathways_tab)
        }

        if (GeneSymbolsTable == TRUE) {
            return(table_pathway_enriched_filt)
        }
    }

#' @title Differentially expression analysis (DEA) using limma package.
#' @description Differentially expression analysis (DEA) using limma package.
#' @param FC.cut write
#' @param AffySet A matrix-like data object containing log-ratios or log-expression values
#' for a series of arrays, with rows corresponding to genes and columns to samples
#' @examples
#' \dontrun{
#' to add example
#' }
#' @export
#' @return List of list with tables in 2 by 2 comparison
#' of the top-ranked genes from a linear model fitted by DEA's limma
TCGAanalyze_DEA_Affy <- function(AffySet, FC.cut = 0.01) {
    if (!requireNamespace("Biobase", quietly = TRUE)) {
        stop("Biobase package is needed for this function to work. Please install it.",
             call. = FALSE)
    }
    if (!requireNamespace("limma", quietly = TRUE)) {
        stop("limma package is needed for this function to work. Please install it.",
             call. = FALSE)
    }
    Pdatatable <- Biobase::phenoData(AffySet)

    f <- factor(Pdatatable$Disease)
    groupColors <- names(table(f))

    tmp <- matrix(0, length(groupColors), length(groupColors))
    colnames(tmp) <- groupColors
    rownames(tmp) <- groupColors
    tmp[upper.tri(tmp)] <- 1


    sample_tab <- Pdatatable
    f <- factor(Pdatatable$Disease)
    design <- model.matrix( ~ 0 + f)
    colnames(design) <- levels(f)
    fit <-
        limma::lmFit(AffySet, design) ## fit is an object of class MArrayLM.

    groupColors <- names(table(Pdatatable$Disease))

    CompleteList <- vector("list", sum(tmp))

    k <- 1

    for (i in 1:length(groupColors)) {
        col1 <- colnames(tmp)[i]
        for (j in 1:length(groupColors)) {
            col2 <- rownames(tmp)[j]

            if (i != j) {
                if (tmp[i, j] != 0) {
                    Comparison <- paste(col2, "-", col1, sep = "")

                    if (i == 4 &&
                        j == 6) {
                        Comparison <- paste(col1, "-", col2, sep = "")
                    }
                    if (i == 5 &&
                        j == 6) {
                        Comparison <- paste(col1, "-", col2, sep = "")
                    }

                    print(paste(i, j, Comparison, "to do..."))

                    cont.matrix <-
                        limma::makeContrasts(I = Comparison, levels = design)

                    fit2 <- limma::contrasts.fit(fit, cont.matrix)
                    fit2 <- limma::eBayes(fit2)



                    sigI <-
                        limma::topTable(
                            fit2,
                            coef = 1,
                            adjust.method = "BH",
                            sort.by = "B",
                            p.value = 0.05,
                            lfc = FC.cut,
                            number = 50000
                        )

                    sigIbis <-
                        sigI[order(abs(as.numeric(sigI$logFC)), decreasing = TRUE), ]
                    names(CompleteList)[k] <- gsub("-", "_", Comparison)
                    CompleteList[[k]] <- sigIbis
                    k <- k + 1
                }
            }
        }

    }

    return(CompleteList)
}


#' @title Generate network
#' @description TCGAanalyze_analyseGRN perform gene regulatory network.
#' @param TFs a vector of genes.
#' @param normCounts is a matrix of gene expression with genes in rows and samples in columns.
#' @param kNum the number of nearest neighbors to consider to estimate the mutual information.
#' Must be less than the number of columns of normCounts.
#' @export
#' @return an adjacent matrix
TCGAanalyze_analyseGRN <- function(TFs, normCounts, kNum) {
    if (!requireNamespace("parmigene", quietly = TRUE)) {
        stop(
            "parmigene package is needed for this function to work. Please install it.",
            call. = FALSE
        )
    }
    MRcandidates <- intersect(rownames(normCounts), TFs)

    # Mutual information between TF and genes
    sampleNames <- colnames(normCounts)
    geneNames <- rownames(normCounts)

    messageMI_TFgenes <-
        paste(
            "Estimation of MI among [",
            length(MRcandidates),
            " TRs and ",
            nrow(normCounts),
            " genes].....",
            sep = ""
        )
    timeEstimatedMI_TFgenes1 <-
        length(MRcandidates) * nrow(normCounts) / 1000
    timeEstimatedMI_TFgenes <-
        format(timeEstimatedMI_TFgenes1 * ncol(normCounts) / 17000,
               digits = 2)
    messageEstimation <-
        print(
            paste(
                "I Need about ",
                timeEstimatedMI_TFgenes,
                "seconds for this MI estimation. [Processing 17000k elements /s]  "
            )
        )

    system.time(miTFGenes <-
                    parmigene::knnmi.cross(normCounts[MRcandidates,], normCounts, k = kNum))

    return(miTFGenes)

}

#' @title Generate pathview graph
#' @description TCGAanalyze_Pathview pathway based data integration and visualization.
#' @param dataDEGs dataDEGs
#' @param pathwayKEGG pathwayKEGG
#' @export
#' @return an adjacent matrix
#' @examples
#' \dontrun{
#'   dataDEGs <- data.frame(mRNA = c("TP53","TP63","TP73"), logFC = c(1,2,3))
#'   TCGAanalyze_Pathview(dataDEGs)
#' }
TCGAanalyze_Pathview <-
    function(dataDEGs, pathwayKEGG = "hsa05200") {
        if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
            stop("clusterProfiler needed for this function to work. Please install it.",
                 call. = FALSE)
        }
        if (!requireNamespace("pathview", quietly = TRUE)) {
            stop("pathview needed for this function to work. Please install it.",
                 call. = FALSE)
        }
        # Converting Gene symbol to gene ID
        eg = as.data.frame(
            clusterProfiler::bitr(
                dataDEGs$mRNA,
                fromType = "SYMBOL",
                toType = "ENTREZID",
                OrgDb = "org.Hs.eg.db"
            )
        )
        eg <- eg[!duplicated(eg$SYMBOL), ]
        dataDEGs <- dataDEGs[dataDEGs$mRNA %in% eg$SYMBOL, ]
        dataDEGs <- dataDEGs[order(dataDEGs$mRNA, decreasing = FALSE), ]
        eg <- eg[order(eg$SYMBOL, decreasing = FALSE), ]
        dataDEGs$GeneID <- eg$ENTREZID
        dataDEGsFiltLevel_sub <-
            subset(dataDEGs, select = c("GeneID", "logFC"))
        genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
        names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID

        hsa05200 <- pathview::pathview(
            gene.data  = genelistDEGs,
            pathway.id = pathwayKEGG,
            species    = "hsa",
            limit      = list(gene = as.integer(max(
                abs(genelistDEGs)
            )))
        )

    }


#' @title infer gene regulatory networks
#' @description TCGAanalyze_networkInference taking expression data as input,
#' this will return an adjacency matrix of interactions
#' @param data expression data, genes in columns, samples in rows
#' @param optionMethod inference method, chose from aracne, c3net, clr and mrnet
#' @export
#' @return an adjacent matrix
TCGAanalyze_networkInference <-
    function(data, optionMethod = "clr") {
        # Converting Gene symbol to gene ID
        if (optionMethod == "c3net") {
            if (!requireNamespace("c3net", quietly = TRUE)) {
                stop(
                    "c3net package is needed for this function to work. Please install it.",
                    call. = FALSE
                )
            }

            net <- c3net::c3net(t(data))
        } else {
            if (!requireNamespace("minet", quietly = TRUE)) {
                stop(
                    "minet package is needed for this function to work. Please install it.",
                    call. = FALSE
                )
            }
            net <- minet::minet(data, method = optionMethod)
        }
        return(net)

    }


#' Creates a plot for GAIA output (all significant aberrant regions.)
#' @description
#' This function is a auxiliary function to visualize GAIA output
#' (all significant aberrant regions.)
#' @param calls A matrix with the following columns: Chromossome, Aberration Kind
#' Region Start, Region End, Region Size and score
#' @param threshold Score threshold (orange horizontal line in the plot)
#' @export
#' @importFrom graphics abline axis legend plot points
#' @return A plot with all significant aberrant regions.
#' @examples
#' call <- data.frame("Chromossome" = rep(9,100),
#'                    "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#'                    "Region Start [bp]" = 18259823:18259922,
#'                    "Region End [bp]" = 18259823:18259922,
#'                    "score" = rep(c(1,2,3,4),25))
#'  gaiaCNVplot(call,threshold = 0.01)
#'  call <- data.frame("Chromossome" = rep(c(1,9),50),
#'                     "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#'                     "Region Start [bp]" = 18259823:18259922,
#'                     "Region End [bp]" = 18259823:18259922,
#'                     "score" = rep(c(1,2,3,4),25))
#'  gaiaCNVplot(call,threshold = 0.01)
gaiaCNVplot <- function (calls,  threshold = 0.01) {
    Calls <-
        calls[order(calls[, grep("start", colnames(calls), ignore.case = TRUE)]), ]
    Calls <-
        Calls[order(Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]), ]
    rownames(Calls) <- NULL
    Chromo <- Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]
    Gains <-
        apply(Calls, 1, function(x)
            ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 1, x["score"], 0))
    Losses <-
        apply(Calls, 1, function(x)
            ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 0, x["score"], 0))
    plot(
        Gains,
        ylim = c(-max(Calls[, "score"] + 2), max(Calls[, "score"] + 2)),
        type = "h",
        col = "red",
        xlab = "Chromosome",
        ylab = "Score",
        xaxt = "n"
    )
    points(-(Losses), type = "h", col = "blue")
    # Draw origin line
    abline(h = 0, cex = 4)
    # Draw threshold lines
    abline(
        h = -log10(threshold),
        col = "orange",
        cex = 4,
        main = "test"
    )
    abline(
        h = log10(threshold),
        col = "orange",
        cex = 4,
        main = "test"
    )

    uni.chr <- unique(Chromo)
    temp <- rep(0, length(uni.chr))
    for (i in 1:length(uni.chr)) {
        temp[i] <- max(which(uni.chr[i] == Chromo))
    }
    for (i in 1:length(temp)) {
        abline(v = temp[i],
               col = "black",
               lty = "dashed")
    }
    nChroms <- length(uni.chr)
    begin <- c()
    for (d in 1:nChroms) {
        chrom <- sum(Chromo == uni.chr[d])
        begin <- append(begin, chrom)
    }
    temp2 <- rep(0, nChroms)
    for (i in 1:nChroms) {
        if (i == 1) {
            temp2[1] <- (begin[1] * 0.5)
        }
        else if (i > 1) {
            temp2[i] <- temp[i - 1] + (begin[i] * 0.5)
        }
    }
    uni.chr[uni.chr == 23] <- "X"
    uni.chr[uni.chr == 24] <- "Y"
    for (i in 1:length(temp)) {
        axis(1,
             at = temp2[i],
             labels = uni.chr[i],
             cex.axis = 1)
    }
    legend(
        x = 1,
        y = max(Calls[, "score"] + 2),
        y.intersp = 0.8,
        c("Amp"),
        pch = 15,
        col = c("red"),
        text.font = 3
    )
    legend(
        x = 1,
        y = -max(Calls[, "score"] + 0.5),
        y.intersp = 0.8,
        c("Del"),
        pch = 15,
        col = c("blue"),
        text.font = 3
    )
}

#' Get a matrix of interactions of genes from biogrid
#' @description
#' Using biogrid database, it will create a matrix of gene interactions.
#' If columns A and row B has value 1, it means the gene A and gene B interacts.
#' @param tmp.biogrid Biogrid table
#' @export
#' @param names.genes List of genes to filter from output. Default: consider all genes
#' @return A matrix with 1 for genes that interacts, 0 for no interaction.
#' @examples
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' tmp.biogrid <- data.frame("Official.Symbol.Interactor.A" = names.genes.de,
#'                           "Official.Symbol.Interactor.B" = rev(names.genes.de))
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' \dontrun{
#'   file <- paste0("http://thebiogrid.org/downloads/archives/",
#'                  "Release%20Archive/BIOGRID-3.4.133/BIOGRID-ALL-3.4.133.tab2.zip")
#'   downloader::download(file,basename(file))
#'   unzip(basename(file),junkpaths =TRUE)
#'   tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)),
#'                           header=TRUE, sep="\t", stringsAsFactors=FALSE)
#'   names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#'   net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' }
getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) {
    it.a <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[1]
    it.b <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[2]

    if (is.null(names.genes)) {
        names.genes <-
            sort(union(unique(tmp.biogrid[, it.a]), unique(tmp.biogrid[, it.b])))
        ind <- seq(1, nrow(tmp.biogrid))
    } else {
        ind.A <- which(tmp.biogrid[, it.a] %in% names.genes)
        ind.B <- which(tmp.biogrid[, it.b] %in% names.genes)
        ind <- intersect(ind.A, ind.B)
    }

    mat.biogrid <- matrix(
        0,
        nrow = length(names.genes),
        ncol = length(names.genes),
        dimnames = list(names.genes, names.genes)
    )

    for (i in ind) {
        mat.biogrid[tmp.biogrid[i, it.a], tmp.biogrid[i, it.b]] <-
            mat.biogrid[tmp.biogrid[i, it.b], tmp.biogrid[i, it.a]] <- 1
    }
    diag(mat.biogrid) <- 0

    return(mat.biogrid)
}

#' Get GDC samples with both DNA methylation (HM450K) and Gene expression data from
#' GDC databse
#' @description
#' For a given TCGA project it gets the  samples (barcode) with both DNA methylation and Gene expression data
#' from GDC database
#' @param project A GDC project
#' @param n Number of samples to return. If NULL return all (default)
#' @param legacy Access legacy (hg19) or harmonized database (hg38).
#' @return A vector of barcodes
#' @export
#' @examples
#' # Get ACC samples with both  DNA methylation (HM450K) and gene expression aligned to hg19
#' samples <- matchedMetExp("TCGA-UCS", legacy = TRUE)
matchedMetExp <- function(project, legacy = FALSE, n = NULL) {
    if (legacy) {
        # get primary solid tumor samples: DNA methylation
        message("Download DNA methylation information")
        met450k <- GDCquery(
            project = project,
            data.category = "DNA methylation",
            platform = "Illumina Human Methylation 450",
            legacy = TRUE,
            sample.type = c("Primary Tumor")
        )

        # get primary solid tumor samples: RNAseq
        message("Download gene expression information")
        exp <- GDCquery(
            project = project,
            data.category = "Gene expression",
            data.type = "Gene expression quantification",
            platform = "Illumina HiSeq",
            file.type  = "results",
            sample.type = c("Primary Tumor"),
            legacy = TRUE
        )
    } else {
        # get primary solid tumor samples: DNA methylation
        message("Download DNA methylation information")
        met450k <- GDCquery(
            project = project,
            data.category = "DNA Methylation",
            platform = "Illumina Human Methylation 450",
            sample.type = c("Primary Tumor")
        )

        # get primary solid tumor samples: RNAseq
        message("Download gene expression information")
        exp <- GDCquery(
            project = project,
            data.category = "Transcriptome Profiling",
            data.type = "Gene Expression Quantification",
            workflow.type = "HTSeq - Counts"
        )


    }
    met450k.tp <-  met450k$results[[1]]$cases
    # Get patients with samples in both platforms
    exp.tp <-  exp$results[[1]]$cases
    patients <-
        unique(substr(exp.tp, 1, 15)[substr(exp.tp, 1, 12) %in% substr(met450k.tp, 1, 12)])
    if (!is.null(n))
        patients <- patients[1:n] # get only n samples
    return(patients)
}


#' @title Generate Stemness Score based on RNASeq (mRNAsi stemness index) Malta et al., Cell, 2018
#' @description TCGAanalyze_Stemness generate the mRNAsi score
#' @param stemSig is a vector of the stemness Signature generated using gelnet package
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param annotation as default is FALSE.
#' If annotation == subtype it returns the molecular subtype of a sample.
#' If annotation == sampleType it returns the type of a sample (normal or tumor)
#' @export
#' @return table with samples and stemness score
#' @examples
#'  # Selecting TCGA breast cancer (10 samples) for example stored in dataBRCA
#'  dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA, geneInfo =  geneInfo)
#'  # quantile filter of genes
#'  dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
#'                                   method = "quantile",
#'                                   qnt.cut =  0.25)
#'  dataBRCA_stemness <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
#'  dataGE = dataFilt, annotation = "sampleType")
TCGAanalyze_Stemness <- function(stemSig,
                                 dataGE,
                                 annotation = FALSE) {
    reads <- dataGE
    X <- reads
    w <- stemSig
    commonStemsigGenes <- intersect(names(w), rownames(X))

    X <- X[commonStemsigGenes, ]
    w <- w[rownames(X)]

    # Score the Matrix X using Spearman correlation.

    s <-
        apply(X, 2, function(z) {
            cor(z, w, method = "sp", use = "complete.obs")
        })

    ## Scale the scores to be between 0 and 1
    s <- s - min(s)
    s <- s / max(s)

    dataSce_stemness <- cbind(s)

    dataAnnotationSC <- matrix(0, ncol(reads), 2)
    colnames(dataAnnotationSC) <- c("Sample", "Annotation")


    dataAnnotationSC <- as.data.frame(dataAnnotationSC)
    dataAnnotationSC$Sample <- colnames(reads)
    rownames(dataAnnotationSC) <- colnames(reads)

    dataAnnotationSC <-
        cbind(dataAnnotationSC, StemnessScore = rep(0, nrow(dataAnnotationSC)))
    dataAnnotationSC[rownames(dataSce_stemness), "StemnessScore"] <-
        as.numeric(dataSce_stemness)

    colnames(dataAnnotationSC)[1] <- "Sample"
    if (annotation == "sampleType") {
        sampleTP <-
            TCGAquery_SampleTypes(barcode = dataAnnotationSC$Sample, typesample = "TP")
        sampleNT <-
            TCGAquery_SampleTypes(barcode = dataAnnotationSC$Sample, typesample = "NT")

        dataAnnotationSC[sampleTP, "Annotation"] <- "TP"
        dataAnnotationSC[sampleNT, "Annotation"] <- "NT"
    }

    if (annotation == "subtype") {
        dataSubt <- TCGAquery_subtype(tumor = "BRCA")

        for (i in 1:nrow(dataAnnotationSC)) {
            curSample <- dataAnnotationSC$Sample[i]
            dataAnnotationSC$Annotation <-
                dataSubt[dataSubt$patient %in% substr(curSample, 1, 12), "BRCA_Subtype_PAM50"]
        }
    }

    return(dataAnnotationSC)
}

Try the TCGAbiolinks package in your browser

Any scripts or data that you put into this service are public.

TCGAbiolinks documentation built on Nov. 8, 2020, 5:37 p.m.