R/analyze.R

Defines functions matchedMetExp getAdjacencyBiogrid TCGAanalyze_networkInference TCGAanalyze_Pathview TCGAanalyze_analyseGRN TCGAanalyze_DEA_Affy TCGAanalyze_EAcomplete TCGAanalyze_LevelTab UseRaw_afterFilter TCGAanalyze_Normalization TCGAanalyze_Filtering TCGAanalyze_SurvivalKM TCGAanalyze_Preprocessing TCGAanalyze_Clustering

Documented in getAdjacencyBiogrid matchedMetExp TCGAanalyze_analyseGRN TCGAanalyze_Clustering TCGAanalyze_DEA_Affy TCGAanalyze_EAcomplete TCGAanalyze_Filtering TCGAanalyze_LevelTab TCGAanalyze_networkInference TCGAanalyze_Normalization TCGAanalyze_Pathview TCGAanalyze_Preprocessing TCGAanalyze_SurvivalKM UseRaw_afterFilter

#' @title Hierarchical cluster analysis
#' @description Hierarchical cluster analysis using several methods such as
#'  ward.D", "ward.D2", "single", "complete", "average" (= UPGMA),
#'  "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare.
#' @param method is method to be used for generic cluster such as 'hclust'
#' or 'consensus'
#' @param methodHC is method to be used for Hierarchical cluster.
#' @import stats
#' @export
#' @return object of class hclust if method selected is 'hclust'.
#' If method selected is 'Consensus' returns a list of length maxK
#' (maximum cluster number to evaluate.). Each element is a list containing
#' consensusMatrix (numerical matrix), consensusTree (hclust), consensusClass
#' (consensus class assignments). ConsensusClusterPlus also produces images.
TCGAanalyze_Clustering <- function(
        tabDF,
        method,
        methodHC = "ward.D2"
) {

    if (!requireNamespace("ConsensusClusterPlus", quietly = TRUE)) {
        stop("ConsensusClusterPlus is needed. Please install it.", call. = FALSE)
    }

    if (method == "hclust") {
        ans <- hclust(ddist <- dist(tabDF), method = methodHC)
    }

    if (method == "consensus") {
        sHc <- hclust(ddist <- dist(tabDF), method = methodHC)
        ans <- ConsensusClusterPlus::ConsensusClusterPlus(
            ddist,
            maxK = 7,
            pItem = 0.9,
            reps = 1000,
            title = "mc_consensus_k7_1000",
            clusterAlg = "hc",
            innerLinkage = "ward.D2",
            finalLinkage = "complete",
            plot = 'pdf',
            writeTable = TRUE
        )
    }

    return(ans)
}


#' @title Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier
#' @description TCGAanalyze_Preprocessing perform Array Array Intensity correlation (AAIC).
#' It defines a square symmetric matrix of spearman correlation among samples.
#' According this matrix and boxplot of correlation samples by samples it is possible
#' to find samples with low correlation that can be identified as possible outliers.
#' @param object gene expression of class RangedSummarizedExperiment from TCGAprepare
#' @param cor.cut is a threshold to filter samples according their spearman correlation in
#' samples by samples. default cor.cut is 0
#' @param filename Filename of the image file
#' @param width Image width
#' @param height Image height
#' @param datatype is a string from RangedSummarizedExperiment assay
#' @importFrom grDevices dev.list
#' @importFrom SummarizedExperiment assays
#' @export
#' @return Plot with array array intensity correlation and boxplot of correlation samples by samples
TCGAanalyze_Preprocessing <- function(
        object,
        cor.cut = 0,
        datatype = names(assays(object))[1],
        filename = NULL,
        width = 1000,
        height = 1000
) {
    # This is a work around for raw_counts and raw_count
    if(missing(object))
        stop("Please set object argument")

    if(!is(object,"RangedSummarizedExperiment"))
        stop("Input object should be a RangedSummarizedExperiment")

    if (grepl("raw_counts", datatype) & any(grepl("raw_counts", names(assays(object)))))
        datatype <- names(assays(object))[grepl("raw_counts", names(assays(object)))]

    if (!any(grepl(datatype, names(assays(object)))))
        stop(
            paste0(
                datatype,
                " not found in the assay list: ",
                paste(names(assays(object)), collapse = ", "),
                "\n  Please set the correct datatype argument."
            )
        )


    if (!(is.null(dev.list()["RStudioGD"]))) {
        dev.off()
    }
    if (is.null(filename)) filename <- "PreprocessingOutput.png"

    png(filename, width = width, height = height)
    par(oma = c(10, 10, 10, 10))
    ArrayIndex <-  as.character(1:length(colData(object)$barcode))

    pmat_new <- matrix(0, length(ArrayIndex), 4)
    colnames(pmat_new) <- c("Disease", "platform", "SampleID", "Study")
    rownames(pmat_new) <- as.character(colData(object)$barcode)
    pmat_new <- as.data.frame(pmat_new)
    pmat_new$Disease <- as.character(colData(object)$definition)
    pmat_new$platform <- "platform"
    pmat_new$SampleID <- as.character(colData(object)$barcode)
    pmat_new$Study <- "study"

    tabGroupCol <- cbind(pmat_new, Color = matrix(0, nrow(pmat_new), 1))

    for (i in seq_along(unique(tabGroupCol$Disease))) {
        tabGroupCol[which(tabGroupCol$Disease == tabGroupCol$Disease[i]), "Color"] <-
            rainbow(length(unique(tabGroupCol$Disease)))[i]
    }
    pmat <- pmat_new
    phenodepth <- min(ncol(pmat), 3)
    order <- switch(
        phenodepth + 1,
        ArrayIndex,
        order(pmat[, 1]),
        order(pmat[, 1], pmat[, 2]),
        order(pmat[, 1], pmat[, 2], pmat[, 3])
    )
    arraypos <-  (1:length(ArrayIndex)) * (1 / (length(ArrayIndex) - 1)) - (1 / (length(ArrayIndex) - 1))
    arraypos2 = seq(1:length(ArrayIndex) - 1)

    for (i in 2:length(ArrayIndex)) {
        arraypos2[i - 1] <- (arraypos[i] + arraypos[i - 1]) / 2
    }
    layout(matrix(c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 3, 3, 4), 4, 4, byrow = TRUE))

    c <- cor(assay(object, datatype)[, order], method = "spearman")

    image(c,
          xaxt = "n",
          yaxt = "n",
          #xlab = "Array Samples",
          #ylab = "Array Samples",
          main = "Array-Array Intensity Correlation after RMA"
    )

    for (i in 1:length(names(table(tabGroupCol$Color)))) {
        currentCol <- names(table(tabGroupCol$Color))[i]
        pos.col <- arraypos[which(tabGroupCol$Color == currentCol)]
        lab.col <- colnames(c)[which(tabGroupCol$Color == currentCol)]
        #axis(1, labels = lab.col , at = pos.col, col = currentCol,lwd = 6,las = 2)
        axis(
            2,
            labels = lab.col ,
            at = pos.col,
            col = currentCol,
            lwd = 6,
            las = 2
        )
    }

    m <- matrix(pretty(c, 10), nrow = 1, ncol = length(pretty(c, 10)))

    image(
        m,
        xaxt = "n",
        yaxt = "n",
        ylab = "Correlation Coefficient"
    )

    axis(2,
         labels = as.list(pretty(c, 10)),
         at = seq(0, 1, by = (1 / (
             length(pretty(c,  10)) - 1
         ))))

    abline(
        h = seq(
            (1 / ( length(pretty(c, 10)) - 1)) / 2,
            1 - (1 / ( length(pretty(c, 10)) - 1)),
            by = (1 / ( length(pretty(c, 10)) - 1))
        )
    )
    box()
    boxplot(
        c,
        outline = FALSE,
        las = 2,
        lwd = 6,
        # names = NULL,
        col = tabGroupCol$Color,
        main = "Boxplot of correlation samples by samples after normalization"
    )
    dev.off()

    samplesCor <- rowMeans(c)

    message("Number of outliers: ", sum(samplesCor < cor.cut))
    objectWO <-  assay(object, datatype)[, names(samplesCor)[samplesCor > cor.cut]]

    return(objectWO)
}

#' @title survival analysis (SA) univariate with Kaplan-Meier (KM) method.
#' @description TCGAanalyze_SurvivalKM perform an univariate Kaplan-Meier (KM) survival analysis (SA).
#' It performed Kaplan-Meier survival univariate using complete follow up with all days
#' taking one gene a time from Genelist of gene symbols.
#' For each gene according its level of mean expression in cancer samples,
#' defining two thresholds for quantile
#' expression of that gene in all samples (default ThreshTop=0.67,ThreshDown=0.33) it is possible
#' to define a threshold of intensity of gene expression to divide the samples in 3 groups
#' (High, intermediate, low).
#' TCGAanalyze_SurvivalKM performs SA between High and low groups using following functions
#' from survival package
#' \enumerate{
#' \item survival::Surv
#' \item survival::survdiff
#' \item survival::survfit
#' }
#' @param clinical_patient is a data.frame using function 'clinic' with information
#' related to barcode / samples such as bcr_patient_barcode, days_to_death ,
#' days_to_last_follow_up , vital_status, etc
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param Genelist is a list of gene symbols where perform survival KM.
#' @param Survresult is a parameter (default = FALSE) if is TRUE will show KM plot and results.
#' @param ThreshTop is a quantile threshold to identify samples with high expression of a gene
#' @param ThreshDown is a quantile threshold to identify samples with low expression of a gene
#' @param p.cut p.values threshold. Default: 0.05
#' @param group1 a string containing the barcode list of the samples in in control group
#' @param group2 a string containing the barcode list of the samples in in disease group
#' @export
#' @return table with survival genes pvalues from KM.
#' @examples
#'  # Selecting only 20 genes for example
#'  dataBRCAcomplete <- log2(dataBRCA[1:20,] + 1)
#'
#'  # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical")
#'  clinical_patient_Cancer <- data.frame(
#'       bcr_patient_barcode = substr(colnames(dataBRCAcomplete),1,12),
#'       vital_status = c(rep("alive",3),"dead",rep("alive",2),rep(c("dead","alive"),2)),
#'       days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA),
#'       days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417)
#'  )
#'
#'  group1 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("NT"))
#'  group2 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("TP"))
#'
#'  tabSurvKM <- TCGAanalyze_SurvivalKM(
#'     clinical_patient = clinical_patient_Cancer,
#'     dataGE = dataBRCAcomplete,
#'     Genelist = rownames(dataBRCAcomplete),
#'     Survresult = FALSE,
#'     p.cut = 0.4,
#'     ThreshTop = 0.67,
#'     ThreshDown = 0.33,
#'     group1 = group1, # Control group
#'     group2 = group2
#'   ) # Disease group
#'
#'  # If the groups are not specified group1 == group2 and all samples are used
#'  \dontrun{
#'  tabSurvKM <- TCGAanalyze_SurvivalKM(
#'    clinical_patient_Cancer,
#'    dataBRCAcomplete,
#'    Genelist = rownames(dataBRCAcomplete),
#'    Survresult = TRUE,
#'    p.cut = 0.2,
#'     ThreshTop = 0.67,
#'     ThreshDown = 0.33
#'   )
#' }
TCGAanalyze_SurvivalKM <- function(
        clinical_patient,
        dataGE,
        Genelist,
        Survresult = FALSE,
        ThreshTop = 0.67,
        ThreshDown = 0.33,
        p.cut = 0.05,
        group1,
        group2
) {

    check_package("survival")
    # Check which genes we really have in the matrix
    Genelist <- intersect(rownames(dataGE), Genelist)

    # Split gene expression matrix btw the groups
    dataCancer <- dataGE[Genelist, group2, drop = FALSE]
    dataNormal <- dataGE[Genelist, group1, drop = FALSE]
    colnames(dataCancer)  <- substr(colnames(dataCancer), 1, 12)

    cfu <- clinical_patient[clinical_patient[, "bcr_patient_barcode"] %in% substr(colnames(dataCancer), 1, 12), ]

    if ("days_to_last_followup" %in% colnames(cfu))
        colnames(cfu)[grep("days_to_last_followup", colnames(cfu))] <- "days_to_last_follow_up"

    cfu <-  as.data.frame(
        subset(
            cfu,
            select = c(
                "bcr_patient_barcode",
                "days_to_death",
                "days_to_last_follow_up",
                "vital_status"
            )
        ))

    # Set alive death to inf
    if (length(grep("alive", cfu$vital_status, ignore.case = TRUE)) > 0)
        cfu[grep("alive", cfu$vital_status, ignore.case = TRUE), "days_to_death"] <- "-Inf"

    # Set dead follow up to inf
    if (length(grep("dead", cfu$vital_status, ignore.case = TRUE)) > 0)
        cfu[grep("dead", cfu$vital_status, ignore.case = TRUE), "days_to_last_follow_up"] <- "-Inf"

    cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
    cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]

    followUpLevel <- FALSE

    cfu$days_to_death <- as.numeric(as.character(cfu$days_to_death))
    cfu$days_to_last_follow_up <- as.numeric(as.character(cfu$days_to_last_follow_up))
    rownames(cfu) <- cfu[, "bcr_patient_barcode"] #mod1

    cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
    cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]

    cfu_complete <- cfu
    ngenes <- nrow(as.matrix(rownames(dataNormal)))

    # Evaluate each gene
    tabSurv_Matrix <- plyr::adply(
        .data = 1:length(rownames(dataNormal)),
        .margins = 1,
        .fun = function(i){

            mRNAselected <- as.matrix(rownames(dataNormal))[i]
            mRNAselected_values <-  dataCancer %>% dplyr::filter(rownames(dataCancer) == mRNAselected) %>% as.numeric
            mRNAselected_values_normal <- dataNormal %>% dplyr::filter(rownames(dataNormal) == mRNAselected) %>% as.numeric
            if (all(mRNAselected_values == 0))  return(NULL) # All genes are 0
            tabSurv_Matrix <- data.frame("mRNA" = mRNAselected)

            # Get Thresh values for cancer expression
            mRNAselected_values_ordered <- sort(mRNAselected_values, decreasing = TRUE)
            mRNAselected_values_ordered_top <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshTop)[1])
            mRNAselected_values_ordered_down <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshDown)[1])

            mRNAselected_values_newvector <- mRNAselected_values

            if (!is.na(mRNAselected_values_ordered_top)) {
                # How many samples do we have
                numberOfSamples <- length(mRNAselected_values_ordered)

                # High group (above ThreshTop)
                lastelementTOP <- max(which(
                    mRNAselected_values_ordered > mRNAselected_values_ordered_top
                ))

                # Low group (below ThreshDown)
                firstelementDOWN <-  min(
                    which(
                        mRNAselected_values_ordered <= mRNAselected_values_ordered_down
                    )
                )

                samples_top_mRNA_selected <- names(mRNAselected_values_ordered[1:lastelementTOP])
                samples_down_mRNA_selected <- names(mRNAselected_values_ordered[firstelementDOWN:numberOfSamples])

                # Which samples are in the intermediate group (above ThreshLow and below ThreshTop)
                samples_UNCHANGED_mRNA_selected <- names(mRNAselected_values_newvector[
                    which((mRNAselected_values_newvector) > mRNAselected_values_ordered_down &  mRNAselected_values_newvector < mRNAselected_values_ordered_top
                    )])

                cfu_onlyTOP <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_top_mRNA_selected, ]
                cfu_onlyDOWN <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_down_mRNA_selected, ]
                cfu_onlyUNCHANGED <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_UNCHANGED_mRNA_selected, ]

                cfu_ordered <- NULL
                cfu_ordered <- rbind(cfu_onlyTOP, cfu_onlyDOWN)
                cfu <- cfu_ordered

                ttime <- as.numeric(cfu[, "days_to_death"])

                sum(status <- ttime > 0) # morti
                deads_complete <- sum(status <- ttime > 0)

                ttime_only_top <- cfu_onlyTOP[, "days_to_death"]
                deads_top <- sum(ttime_only_top > 0)

                if (dim(cfu_onlyDOWN)[1] >= 1) {
                    ttime_only_down <- cfu_onlyDOWN[, "days_to_death"]
                    deads_down <- sum(ttime_only_down > 0)
                } else {
                    deads_down <- 0
                }

                tabSurv_Matrix[1, "Cancer Deaths"] <- deads_complete
                tabSurv_Matrix[1, "Cancer Deaths with Top"] <- deads_top
                tabSurv_Matrix[1, "Cancer Deaths with Down"] <- deads_down
                tabSurv_Matrix[1, "Mean Normal"] <- mean(as.numeric(mRNAselected_values_normal))
                dataCancer_onlyTop_sample <- dataCancer[, samples_top_mRNA_selected, drop = FALSE]
                dataCancer_onlyTop_sample_mRNASelected <- dataCancer_onlyTop_sample[rownames(dataCancer_onlyTop_sample) == mRNAselected, ]
                dataCancer_onlyDown_sample <- dataCancer[, samples_down_mRNA_selected, drop = FALSE]
                dataCancer_onlyDown_sample_mRNASelected <- dataCancer_onlyDown_sample[rownames(dataCancer_onlyDown_sample) == mRNAselected, ]
                tabSurv_Matrix[1, "Mean Tumor Top"] <- mean(as.numeric(dataCancer_onlyTop_sample_mRNASelected))
                tabSurv_Matrix[1, "Mean Tumor Down"] <- mean(as.numeric(dataCancer_onlyDown_sample_mRNASelected))

                ttime[!status] <- as.numeric(cfu[!status, "days_to_last_follow_up"])
                ttime[which(ttime == -Inf)] <- 0

                ttime <- survival::Surv(ttime, status)
                rownames(ttime) <- rownames(cfu)
                legendHigh <- paste(mRNAselected, "High")
                legendLow  <- paste(mRNAselected, "Low")

                tabSurv_pvalue <- tryCatch({
                    tabSurv <- survival::survdiff(ttime  ~ c(rep(
                        "top", nrow(cfu_onlyTOP)
                    ), rep(
                        "down", nrow(cfu_onlyDOWN)
                    )))
                    tabSurv_chis <- unlist(tabSurv)$chisq
                    tabSurv_pvalue <-
                        as.numeric(1 - pchisq(abs(tabSurv$chisq), df = 1))
                }, error = function(e) {
                    return(Inf)
                })
                tabSurv_Matrix[1, "pvalue"] <- tabSurv_pvalue

                if (Survresult == TRUE) {
                    titlePlot <- paste("Kaplan-Meier Survival analysis, pvalue = ",tabSurv_pvalue)
                    plot(
                        survival::survfit(ttime ~ c(
                            rep("low", nrow(cfu_onlyTOP)), rep("high", nrow(cfu_onlyDOWN))
                        )),
                        col = c("green", "red"),
                        main = titlePlot,
                        xlab = "Days",
                        ylab = "Survival"
                    )
                    legend(
                        100,
                        1,
                        legend = c(legendLow, legendHigh),
                        col = c("green", "red"),
                        text.col = c("green", "red"),
                        pch = 15
                    )
                    print(tabSurv)
                }
            } #end if
            tabSurv_Matrix
        },.progress = "time") #end for

    tabSurv_Matrix[tabSurv_Matrix == "-Inf"] <- 0

    tabSurvKM <- tabSurv_Matrix

    # Filtering by selected pvalue < 0.01
    tabSurvKM <- tabSurvKM[tabSurvKM$mRNA != 0, ]
    tabSurvKM <- tabSurvKM[tabSurvKM$pvalue < p.cut, ]
    tabSurvKM <- tabSurvKM[!duplicated(tabSurvKM$mRNA), ]
    rownames(tabSurvKM) <- tabSurvKM$mRNA
    tabSurvKM <- tabSurvKM[, -1]
    tabSurvKM <- tabSurvKM[order(tabSurvKM$pvalue, decreasing = FALSE), ]

    colnames(tabSurvKM) <- gsub("Cancer", "Group2", colnames(tabSurvKM))
    colnames(tabSurvKM) <- gsub("Tumor", "Group2", colnames(tabSurvKM))
    colnames(tabSurvKM) <- gsub("Normal", "Group1", colnames(tabSurvKM))

    return(tabSurvKM)
}


#' @title Filtering mRNA transcripts and miRNA selecting a threshold.
#' @description
#'    TCGAanalyze_Filtering allows user to filter mRNA transcripts and miRNA,
#'    samples, higher than the threshold defined quantile mean across all samples.
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare
#' @param method is method of filtering such as 'quantile', 'varFilter', 'filter1', 'filter2'
#' @param qnt.cut is threshold selected as mean for filtering
#' @param var.func is function used as the per-feature filtering statistic.
#' See genefilter documentation
#' @param var.cutoff is a numeric value. See genefilter documentation
#' @param eta is a parameter for filter1. default eta = 0.05.
#' @param foldChange is a parameter for filter2. default foldChange = 1.
#' @export
#' @return A filtered dataframe or numeric matrix where each row represents a gene,
#' each column represents a sample
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA,
#' geneInfo = geneInfo,
#' method = "geneLength")
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut = 0.25)
TCGAanalyze_Filtering <- function(
        tabDF,
        method,
        qnt.cut = 0.25,
        var.func = IQR,
        var.cutoff = 0.75,
        eta = 0.05,
        foldChange = 1
) {
    if (method == "quantile") {
        GeneThresh <- as.numeric(quantile(rowMeans(tabDF,na.rm = TRUE), qnt.cut))
        geneFiltered <- names(which(rowMeans(tabDF) > GeneThresh))
        tabDF_Filt <- tabDF[geneFiltered,]
    }

    if (method == "varFilter") {
        check_package("genefilter")
        tabDF_Filt <- genefilter::varFilter(
            tabDF,
            var.func = IQR,
            var.cutoff = 0.75,
            filterByQuantile = TRUE
        )
    }

    if (method == "filter1") {
        normCounts <- tabDF
        geData <- t(log(1 + normCounts, 2))
        filter <-
            apply(geData, 2, function(x)
                sum(quantile(x, probs = c(1 - eta, eta)) * c(1, -1)))
        tabDF_Filt <- geData[, which(filter > foldChange)]
    }

    if (method == "filter2") {
        geData <- tabDF
        filter <-
            apply(geData, 2, function(x)
                prod(quantile(x, probs =  c(1 - eta, eta)) - 10) < 0)
        tabDF_Filt <- geData[, which(filter)]
    }


    return(tabDF_Filt)
}

#' @title normalization mRNA transcripts and miRNA using EDASeq package.
#' @description
#'   TCGAanalyze_Normalization allows user to normalize mRNA transcripts and miRNA,
#'    using EDASeq package.
#'
#'    Normalization for RNA-Seq Numerical and graphical
#'     summaries of RNA-Seq read data. Within-lane normalization procedures
#'    to adjust for GC-content effect (or other gene-level effects) on read counts:
#'    loess robust local regression, global-scaling, and full-quantile normalization
#'    (Risso et al., 2011). Between-lane normalization procedures to adjust for
#'    distributional differences between lanes (e.g., sequencing depth):
#'    global-scaling and full-quantile normalization (Bullard et al., 2010).
#'
#'    For istance returns all mRNA or miRNA with mean across all
#'    samples, higher than the threshold defined quantile mean across all samples.
#'
#'    TCGAanalyze_Normalization performs normalization using following functions
#'    from EDASeq
#'    \enumerate{
#'    \item  EDASeq::newSeqExpressionSet
#'    \item  EDASeq::withinLaneNormalization
#'    \item  EDASeq::betweenLaneNormalization
#'    \item  EDASeq::counts
#'    }
#' @param tabDF Rnaseq numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param geneInfo Information matrix of 20531 genes about geneLength and gcContent.
#' Two objects are provided: TCGAbiolinks::geneInfoHT,TCGAbiolinks::geneInfo
#' @param method is method of normalization such as 'gcContent' or 'geneLength'
#' @export
#' @return Rnaseq matrix normalized with counts slot holds the count data as a matrix
#' of non-negative integer count values, one row for each observational unit (gene or the like),
#' and one column for each sample.
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
TCGAanalyze_Normalization <- function(
        tabDF,
        geneInfo,
        method = "geneLength"
) {

    check_package("EDASeq")

    # Check if we have a SE, we need a gene expression matrix
    if (is(tabDF, "SummarizedExperiment")) tabDF <- assay(tabDF)

    geneInfo <- geneInfo[!is.na(geneInfo[, 1]), ]
    geneInfo <- as.data.frame(geneInfo)
    geneInfo$geneLength <- as.numeric(as.character(geneInfo$geneLength))
    geneInfo$gcContent <- as.numeric(as.character(geneInfo$gcContent))


    if(any(grepl("\\|",rownames(tabDF)))){
        tmp <- as.character(rownames(tabDF))
        geneNames <- stringr::str_split(tmp, "\\|",simplify = T)
        tmp <- which(geneNames[, 1] == "?")
        geneNames[tmp, 1] <- geneNames[tmp, 2]
        tmp <- table(geneNames[, 1])
        tmp <- which(geneNames[, 1] %in% names(tmp[which(tmp > 1)]))
        geneNames[tmp, 1] <- paste(geneNames[tmp, 1], geneNames[tmp, 2], sep = ".")
        tmp <- table(geneNames[, 1])
        rownames(tabDF) <- geneNames[, 1]
    } else if(any(grepl("ENSG",rownames(tabDF)))){
        rownames(tabDF) <- gsub("\\.[0-9]*","",rownames(tabDF))
    }


    if (method == "gcContent") {
        rawCounts <- tabDF
        commonGenes <-  intersect(rownames(geneInfo), rownames(rawCounts))

        geneInfo <- geneInfo[commonGenes, ]
        rawCounts <- rawCounts[commonGenes, ]

        timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
        message(
            messageEstimation <- paste(
                "I Need about ",
                timeEstimated,
                "seconds for this Complete Normalization Upper Quantile",
                " [Processing 80k elements /s]  "
            )
        )

        ffData  <- as.data.frame(geneInfo)
        rawCounts <- floor(rawCounts)

        message("Step 1 of 4: newSeqExpressionSet ...")
        tmp <- EDASeq::newSeqExpressionSet(as.matrix(rawCounts), featureData = ffData)

        #fData(tmp)[, "gcContent"] <- as.numeric(geneInfo[, "gcContent"])

        message("Step 2 of 4: withinLaneNormalization ...")
        tmp <- EDASeq::withinLaneNormalization(
            tmp, "gcContent",
            which = "upper",
            offset = TRUE
        )

        message("Step 3 of 4: betweenLaneNormalization ...")
        if(any(is.na(EDASeq::normCounts(tmp)))) {
            tmp <- tmp[rowSums(is.na(EDASeq::normCounts(tmp))) == 0,]
        }
        tmp <- EDASeq::betweenLaneNormalization(
            tmp,
            which = "upper",
            offset = TRUE
        )
        normCounts <-  log(rawCounts[rownames(tmp),] + .1) + EDASeq::offst(tmp)
        normCounts <-  floor(exp(normCounts) - .1)

        message("Step 4 of 4: .quantileNormalization ...")
        tmp <- t(.quantileNormalization(t(normCounts)))
        tabDF_norm <- floor(tmp)
    }

    if (method == "geneLength") {
        tabDF <- tabDF[!duplicated(rownames(tabDF)), !duplicated(colnames(tabDF))]
        tabDF <- tabDF[rownames(tabDF) %in% rownames(geneInfo), ]
        #tabDF <- tabDF[rowMeans(tabDF) > 1,]
        tabDF <- tabDF[which(rowSums(tabDF == 0) < ncol(tabDF)),]
        tabDF <- as.matrix(tabDF)

        geneInfo <-  geneInfo[rownames(geneInfo) %in% rownames(tabDF),]
        geneInfo <- geneInfo[!duplicated(rownames(geneInfo)),]
        toKeep <- which(geneInfo[, "geneLength"] != 0)
        geneInfo <- geneInfo[toKeep,]
        tabDF <- tabDF[toKeep,]
        geneInfo <- as.data.frame(geneInfo)
        tabDF <- round(tabDF)
        commonGenes <- intersect(rownames(tabDF), rownames(geneInfo))

        tabDF <- tabDF[commonGenes, ]
        geneInfo <- geneInfo[commonGenes, ]

        timeEstimated <-  format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
        message(
            messageEstimation <- paste(
                "I Need about ",
                timeEstimated,
                "seconds for this Complete Normalization Upper Quantile",
                " [Processing 80k elements /s]  "
            )
        )

        message("Step 1 of 4: newSeqExpressionSet ...")

        tabDF_norm <- EDASeq::newSeqExpressionSet(
            tabDF,
            featureData = geneInfo
        )
        message("Step 2 of 4: withinLaneNormalization ...")

        tabDF_norm <-  EDASeq::withinLaneNormalization(
            tabDF_norm,
            "geneLength",
            which = "upper",
            offset = FALSE
        )

        message("Step 3 of 4: betweenLaneNormalization ...")
        if(any(is.na(EDASeq::normCounts(tabDF_norm)))) {
            tabDF_norm <- tabDF_norm[rowSums(is.na(EDASeq::normCounts(tabDF_norm))) == 0,]
        }

        tabDF_norm <- EDASeq::betweenLaneNormalization(
            tabDF_norm,
            which = "upper",
            offset = FALSE
        )

        message("Step 4 of 4: exprs ...")
        tabDF_norm <- EDASeq::counts(tabDF_norm)
    }

    # In case NA's were produced to all rows
    if(any(rowSums(is.na(tabDF_norm)) == ncol(tabDF_norm))){
        tabDF_norm <- tabDF_norm[rowSums(is.na(tabDF_norm)) != ncol(tabDF_norm),]
    }

    return(tabDF_norm)
}


#' @title Differential expression analysis (DEA) using edgeR or limma package.
#' @description
#'    TCGAanalyze_DEA allows user to perform Differentially expression analysis (DEA),
#'    using edgeR package or limma to identify differentially expressed genes (DEGs).
#'     It is possible to do a two-class analysis.
#'
#'     TCGAanalyze_DEA performs DEA using following functions from edgeR:
#'     \enumerate{
#'     \item edgeR::DGEList converts the count matrix into an edgeR object.
#'     \item edgeR::estimateCommonDisp each gene gets assigned the same dispersion estimate.
#'     \item edgeR::exactTest performs pair-wise tests for differential expression between two groups.
#'     \item edgeR::topTags takes the output from exactTest(), adjusts the raw p-values using the
#'     False Discovery Rate (FDR) correction, and returns the top differentially expressed genes.
#'     }
#'     TCGAanalyze_DEA performs DEA using following functions from limma:
#'     \enumerate{
#'     \item limma::makeContrasts construct matrix of custom contrasts.
#'     \item limma::lmFit Fit linear model for each gene given a series of arrays.
#'     \item limma::contrasts.fit Given a linear model fit to microarray data, compute estimated coefficients and standard errors for a given set of contrasts.
#'     \item limma::eBayes Given a microarray linear model fit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a common value.
#'     \item limma::toptable Extract a table of the top-ranked genes from a linear model fit.
#'     }
#' @param mat1 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond1type
#' @param mat2 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond2type
#' @param metadata Add metadata
#' @param Cond1type a string containing the class label of the samples in mat1
#'  (e.g., control group)
#' @param Cond2type a string containing the class label of the samples in mat2
#' (e.g., case group)
#' @param pipeline a string to specify which package to use ("limma" or "edgeR")
#' @param method is 'glmLRT' (1) or 'exactTest' (2) used for edgeR
#' (1) Fit a negative binomial generalized log-linear model to
#' the read counts for each gene
#' (2) Compute genewise exact tests for differences in the means between
#' two groups of negative-binomially distributed counts.
#' @param fdr.cut is a threshold to filter DEGs according their p-value corrected
#' @param logFC.cut is a threshold to filter DEGs according their logFC
#' @param batch.factors a vector containing strings to specify options for batch correction. Options are "Plate", "TSS", "Year", "Portion", "Center", and "Patients"
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param paired boolean to account for paired or non-paired samples. Set to TRUE for paired case
#' @param log.trans boolean to perform log cpm transformation. Set to TRUE for log transformation
#' @param trend boolean to perform limma-trend pipeline. Set to TRUE to go through limma-trend
#' @param MAT matrix containing expression set as all samples in columns and genes as rows. Do not provide if mat1 and mat2 are used
#' @param contrast.formula string input to determine coefficients and to design contrasts in a customized way
#' @param Condtypes vector of grouping for samples in MAT
#' @param voom boolean to perform voom transformation for limma-voom pipeline. Set to TRUE for voom transformation
#' @export
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut =  0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(
#'    mat1 = dataFilt[,samplesNT],
#'    mat2 = dataFilt[,samplesTP],
#'    Cond1type = "Normal",
#'    Cond2type = "Tumor"
#' )
#'
#' @return table with DEGs containing for each gene logFC, logCPM, pValue,and FDR, also for each contrast
TCGAanalyze_DEA <- function (
        mat1,
        mat2,
        metadata = TRUE,
        Cond1type,
        Cond2type,
        pipeline = "edgeR",
        method = "exactTest",
        fdr.cut = 1,
        logFC.cut = 0,
        batch.factors = NULL,
        ClinicalDF = data.frame(),
        paired = FALSE,
        log.trans = FALSE,
        voom = FALSE,
        trend = FALSE,
        MAT = data.frame(),
        contrast.formula = "",
        Condtypes = c()
){

    if(pipeline == "limma")  check_package("limma")
    if(pipeline == "edgeR")  check_package("edgeR")

    table.code <- c(
        "TP",
        "TR",
        "TB",
        "TRBM",
        "TAP",
        "TM",
        "TAM",
        "THOC",
        "TBM",
        "NB",
        "NT",
        "NBC",
        "NEBV",
        "NBM",
        "CELLC",
        "TRB",
        "CELL",
        "XP",
        "XCL"
    )
    names(table.code) <- c(
        "01",
        "02",
        "03",
        "04",
        "05",
        "06",
        "07",
        "08",
        "09",
        "10",
        "11",
        "12",
        "13",
        "14",
        "20",
        "40",
        "50",
        "60",
        "61"
    )
    if (nrow(MAT) == 0) {
        TOC <- cbind(mat1, mat2)
        Cond1num <- ncol(mat1)
        Cond2num <- ncol(mat2)
    }  else {
        TOC <- MAT
    }

    if (metadata == TRUE & all(grepl("TCGA",colnames(TOC)))){
        my_IDs <- get_IDs(TOC)
        Plate <- factor(my_IDs$plate)
        Condition <- factor(my_IDs$condition)
        TSS <- factor(my_IDs$tss)
        Portion <- factor(my_IDs$portion)
        Center <- factor(my_IDs$center)
        Patients <- factor(my_IDs$patient)
    }


    # this makes non-sense for non-TCGA data
    if (paired == TRUE) {
        matched.query <- TCGAquery_MatchedCoupledSampleTypes(
            my_IDs$barcode,
            table.code[unique(my_IDs$sample)]
        )
        my_IDs <- subset(my_IDs, barcode == matched.query)
        TOC <- TOC[, (names(TOC) %in% matched.query)]
    }

    if (nrow(ClinicalDF) > 0) {
        names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <- "patient"
        ClinicalDF$age_at_diag_year <-  floor(clinical$age_at_diagnosis / 365)
        ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year + clinical$year_of_birth
        diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
        my_IDs <- merge(my_IDs, ClinicalDF, by = "patient")
        Year <- as.factor(my_IDs$diag_year)
    }

    options <- c("Plate", "TSS", "Year", "Portion", "Center",  "Patients")
    if (length(batch.factors) == 0) {
        message("Batch correction skipped since no factors provided")
    } else {
        for (o in batch.factors) {
            if (o %in% options == FALSE)
                stop(paste0(o, " is not a valid batch correction factor"))
            if (o == "Year" & nrow(ClinicalDF) == 0)
                stop(
                    "batch correction using diagnosis year needs clinical info. Provide Clinical Data in arguments"
                )
        }
    }
    additiveformula <- paste(batch.factors, collapse = "+")

    message("----------------------- DEA -------------------------------")
    if (nrow(MAT) == 0) {
        message("o ",Cond1num," samples in Cond1type ",Cond1type)
        message("o ",Cond2num," samples in Cond2type ",Cond2type)
        message("o ", nrow(TOC), " features as miRNA or genes ")
    } else {
        message("o ", nrow(TOC), " features as miRNA or genes ")
    }
    message("This may take some minutes...")

    colnames(TOC) <- paste0("s", 1:ncol(TOC))

    if (length(Condtypes) > 0) {
        tumorType <- factor(x = Condtypes, levels = unique(Condtypes))
    } else {
        tumorType <- factor(
            x = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num)),
            levels = c(Cond1type, Cond2type)
        )
    }

    if (length(batch.factors) == 0 & length(Condtypes) > 0) {
        if (pipeline == "edgeR")
            design <- model.matrix( ~ tumorType)
        else
            design <- model.matrix( ~ 0 + tumorType)
    } else if (length(batch.factors) == 0 & length(Condtypes) == 0) {
        if (pipeline == "edgeR")
            design <- model.matrix( ~ tumorType)
        else
            design <- model.matrix( ~ 0 + tumorType)
    } else if (length(batch.factors) > 0 & length(Condtypes) == 0) {
        if (pipeline == "edgeR")
            formula <- paste0("~tumorType+", additiveformula)
        else
            formula <- paste0("~0+tumorType+", additiveformula)
        design <- model.matrix(eval(parse(text = formula)))
    } else if (length(batch.factors) > 0 & length(Condtypes) > 0) {
        if (pipeline == "edgeR") {
            formula <- paste0("~tumorType+", additiveformula)
            if (length(Condtypes) > 2)
                formula <- paste0("~0+tumorType+", additiveformula)
        } else {
            formula <- paste0("~0+tumorType+", additiveformula)
        }
        design <- model.matrix(eval(parse(text = formula)))
    }

    if (pipeline == "edgeR") {
        if (method == "exactTest") {
            DGE <- edgeR::DGEList(TOC, group = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num)))
            disp <- edgeR::estimateCommonDisp(DGE)
            tested <- edgeR::exactTest(disp, pair = c(Cond1type,  Cond2type))
            logFC_table <- tested$table
            tableDEA <- edgeR::topTags(tested, n = nrow(tested$table))$table
            tableDEA <- tableDEA[tableDEA$FDR <= fdr.cut,]
            tableDEA <- tableDEA[abs(tableDEA$logFC) >= logFC.cut,]
        }
        else if (method == "glmLRT") {
            if (length(unique(tumorType)) == 2) {
                aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
                aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design)
                aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList, design)
                aGlmFit <- edgeR::glmFit(
                    aDGEList,
                    design,
                    dispersion = aDGEList$tagwise.dispersion,
                    prior.count.total = 0
                )

                aGlmLRT <- edgeR::glmLRT(aGlmFit, coef = 2)
                tableDEA <-  cbind(aGlmLRT$table, FDR = p.adjust(aGlmLRT$table$PValue, "fdr"))
                tableDEA <- tableDEA[tableDEA$FDR < fdr.cut, ]
                tableDEA <- tableDEA[abs(tableDEA$logFC) > logFC.cut, ]

                if (all(grepl("ENSG", rownames(tableDEA)))){
                    tableDEA <- cbind(tableDEA, map.ensg(genes = rownames(tableDEA))[, c("gene_name","gene_type")])
                }
            } else if (length(unique(tumorType)) > 2) {
                aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
                colnames(design)[1:length(levels(tumorType))] <- levels(tumorType)
                prestr = "makeContrasts("
                poststr = ",levels=design)"
                commandstr = paste(prestr, contrast.formula,  poststr, sep = "")
                commandstr = paste0("limma::", commandstr)
                cont.matrix <- eval(parse(text = commandstr))
                aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design)
                aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList,  design)

                aGlmFit <- edgeR::glmFit(
                    aDGEList,
                    design,
                    dispersion = aDGEList$tagwise.dispersion,
                    prior.count.total = 0
                )

                tableDEA <- list()
                for (mycoef in colnames(cont.matrix)) {
                    message(paste0("DEA for", " :", mycoef))
                    aGlmLRT <- edgeR::glmLRT(aGlmFit, contrast = cont.matrix[, mycoef])
                    tt <- aGlmLRT$table
                    tt <-  cbind(tt, FDR = p.adjust(aGlmLRT$table$PValue,"fdr"))
                    tt <- tt[(tt$FDR < fdr.cut & abs(as.numeric(tt$logFC)) > logFC.cut),]
                    tableDEA[[as.character(mycoef)]] <- tt
                    if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
                        tableDEA[[as.character(mycoef)]] <-
                        cbind(tableDEA[[as.character(mycoef)]],
                              map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,2:3])
                }
            }
        } else {
            stop(
                paste0(
                    method,
                    " is not a valid DEA method option. Choose 'exactTest' or 'glmLRT' "
                )
            )
        }
    } else if (pipeline == "limma") {
        if (log.trans == TRUE){
            logCPM <- edgeR::cpm(TOC, log = TRUE, prior.count = 3)
        } else {
            logCPM <- TOC
        }

        if (voom == TRUE) {
            message("Voom Transformation...")
            logCPM <- limma::voom(logCPM, design)
        }

        if (length(unique(tumorType)) == 2) {
            colnames(design)[1:2] <- c(Cond1type, Cond2type)
            contr <- paste0(Cond2type, "-", Cond1type)
            cont.matrix <- limma::makeContrasts(contrasts = contr, levels = design)
            fit <- limma::lmFit(logCPM, design)
            fit <- limma::contrasts.fit(fit, cont.matrix)

            if (trend == TRUE) {
                fit <- limma::eBayes(fit, trend = TRUE)
            } else {
                fit <- limma::eBayes(fit, trend = FALSE)
            }
            tableDEA <- limma::topTable(
                fit,
                coef = 1,
                adjust.method = "fdr",
                number = nrow(TOC)
            )
            limma::volcanoplot(fit, highlight = 10)
            index <- which(tableDEA[, 4] < fdr.cut)
            tableDEA <- tableDEA[index,]
            neg_logFC.cut <- -1 * logFC.cut
            index <- which(abs(as.numeric(tableDEA$logFC)) > logFC.cut)
            tableDEA <- tableDEA[index,]
        }
        else if (length(unique(tumorType)) > 2) {
            DGE <- edgeR::DGEList(TOC, group = tumorType)
            colnames(design)[1:length(levels(tumorType))] <- levels(tumorType)
            prestr = "makeContrasts("
            poststr = ",levels=colnames(design))"
            commandstr = paste(prestr, contrast.formula, poststr, sep = "")
            commandstr = paste0("limma::", commandstr)
            cont.matrix <- eval(parse(text = commandstr))
            fit <- limma::lmFit(logCPM, design)
            fit <- limma::contrasts.fit(fit, cont.matrix)
            if (trend == TRUE){
                fit <- limma::eBayes(fit, trend = TRUE)
            } else {
                fit <- limma::eBayes(fit, trend = FALSE)
            }

            tableDEA <- list()
            for (mycoef in colnames(cont.matrix)) {
                tableDEA[[as.character(mycoef)]] <- limma::topTable(
                    fit,
                    coef = mycoef,
                    adjust.method = "fdr",
                    number = nrow(MAT)
                )
                message(paste0("DEA for", " :", mycoef))
                tempDEA <- tableDEA[[as.character(mycoef)]]
                index.up <- which(tempDEA$adj.P.Val < fdr.cut & abs(as.numeric(tempDEA$logFC)) > logFC.cut)
                tableDEA[[as.character(mycoef)]] <- tempDEA[index.up,]
                if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
                    tableDEA[[as.character(mycoef)]] <-
                    cbind(tableDEA[[as.character(mycoef)]],
                          map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,  2:3])
            }
        }
    }  else {
        stop(paste0(
            pipeline,
            " is not a valid pipeline option. Choose 'edgeR' or 'limma'"
        ))
    }
    message("----------------------- END DEA -------------------------------")
    return(tableDEA)
}


#' @title Batch correction using ComBat and Voom transformation using limma package.
#' @description
#'    TCGAbatch_correction allows user to perform a Voom correction on gene expression data and have it ready for DEA.
#'    One can also use ComBat for batch correction for exploratory analysis. If batch.factor or adjustment argument is "Year"
#'  please provide clinical data. If no batch factor is provided, the data will be voom corrected only
#'
#'     TCGAanalyze_DEA performs DEA using following functions from sva and limma:
#'     \enumerate{
#'     \item limma::voom Transform RNA-Seq Data Ready for Linear Modelling.
#'     \item sva::ComBat Adjust for batch effects using an empirical Bayes framework.
#'     }
#' @param tabDF numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param batch.factor a string containing the batch factor to use for correction.
#'  Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param adjustment vector containing strings for factors to adjust for using ComBat.
#' Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param UnpublishedData if TRUE perform a batch correction after adding new data
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param AnnotationDF a dataframe with column Batch indicating different batches of the samples in the tabDF
#' @export
#' @return data frame with ComBat batch correction applied
TCGAbatch_Correction <- function (
        tabDF,
        batch.factor = NULL,
        adjustment = NULL,
        ClinicalDF = data.frame(),
        UnpublishedData = FALSE,
        AnnotationDF = data.frame()
){
    check_package("sva")

    # code for non-TCGA samples
    if (UnpublishedData == TRUE) {
        if(!"Batch" %in% colnames(AnnotationDF)) {
            stop("AnnotationDF should have a Batch column")
        } else {
            batch.factor <- as.factor(AnnotationDF$Batch)
        }

        if(!"Condition" %in% colnames(AnnotationDF)) {
            mod <- model.matrix(~as.factor(Condition),data = AnnotationDF)
        } else {
            mod <- NULL
        }

        batch_corr <- sva::ComBat(
            dat = tabDF,
            batch = batch.factor,
            mod = mod,
            par.prior = TRUE,
            prior.plots = TRUE
        )
    }

    if (UnpublishedData == FALSE) {
        if (length(batch.factor) == 0 & length(adjustment) == 0)
            message("batch correction will be skipped")
        else if (batch.factor %in% adjustment) {
            stop(paste0("Cannot adjust and correct for the same factor|"))
        }
        my_IDs <- get_IDs(tabDF)
        if (length(batch.factor) > 0 || length(adjustment) > 0)
            if ((nrow(ClinicalDF) > 0 & batch.factor == "Year") ||
                ("Year" %in% adjustment == TRUE & nrow(ClinicalDF) >
                 0)) {
                names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
                    "patient"
                ClinicalDF$age_at_diag_year <-
                    floor(ClinicalDF$age_at_diagnosis / 365)
                ClinicalDF$diag_year <-
                    ClinicalDF$age_at_diag_year +
                    ClinicalDF$year_of_birth
                diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
                Year <- merge(my_IDs, diag_yearDF, by = "patient")
                Year <- Year$diag_year
                Year <- as.factor(Year)
            }
        else if (nrow(ClinicalDF) == 0 & batch.factor == "Year") {
            stop("Cannot extract Year data. Clinical data was not provided")
        }
        Plate <- as.factor(my_IDs$plate)
        Condition <- as.factor(my_IDs$condition)
        TSS <- as.factor(my_IDs$tss)
        Portion <- as.factor(my_IDs$portion)
        Sequencing.Center <- as.factor(my_IDs$center)
        design.mod.combat <- model.matrix( ~ Condition)
        options <- c("Plate", "TSS", "Year", "Portion", "Sequencing Center")

        if (length(batch.factor) > 1)
            stop("Combat can only correct for one batch variable. Provide one batch factor")
        if (batch.factor %in% options == FALSE)
            stop(paste0(o, " is not a valid batch correction factor"))

        for (o in adjustment) {
            if (o %in% options == FALSE)
                stop(paste0(o, " is not a valid adjustment factor"))

        }
        adjustment.data <- c()
        for (a in adjustment) {
            if (a == "Sequencing Center")
                a <- Sequencing.Center
            adjustment.data <-
                cbind(eval(parse(text = a)), adjustment.data)
        }
        if (batch.factor == "Sequencing Center")
            batch.factor <- Sequencing.Center
        batchCombat <- eval(parse(text = batch.factor))
        if (length(adjustment) > 0) {
            adjustment.formula <- paste(adjustment, collapse = "+")
            adjustment.formula <- paste0("+", adjustment.formula)
            adjustment.formula <- paste0("~Condition", adjustment.formula)
            print(adjustment.formula)
            model <- data.frame(batchCombat, row.names = colnames(tabDF))
            design.mod.combat <-  model.matrix(
                eval(parse(text = adjustment.formula)),
                data = model
            )
        }
        print(unique(batchCombat))
        batch_corr <- sva::ComBat(
            dat = tabDF,
            batch = batchCombat,
            mod = design.mod.combat,
            par.prior = TRUE,
            prior.plots = TRUE
        )
    }

    return(batch_corr)
}

##Function to take raw counts by removing rows filtered after norm and filter process###

#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
#' @description function to keep raw counts after filtering and/or normalizing.
#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
#' @param DataFilt Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
#' @examples
#' \dontrun{
#'   dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)
#' }
#' @export
#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
UseRaw_afterFilter <- function(DataPrep, DataFilt) {
    rownames(DataPrep) <-
        lapply(rownames(DataPrep), function(x)
            gsub("[[:punct:]]\\d*", "", x))
    filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
    Res <- DataPrep[!rownames(DataPrep) %in% filtered.list,]
    return(Res)
}

#' @title Adding information related to DEGs genes from DEA as mean values in two conditions.
#' @description
#'    TCGAanalyze_LevelTab allows user to add information related to DEGs genes from
#'    Differentially expression analysis (DEA) such as mean values and in two conditions.
#' @param FC_FDR_table_mRNA Output of dataDEGs filter by abs(LogFC) >=1
#' @param typeCond1 a string containing the class label of the samples
#'  in TableCond1  (e.g., control group)
#' @param typeCond2 a string containing the class label of the samples
#' in TableCond2  (e.g., case group)
#' @param TableCond1 numeric matrix, each row represents a gene, each column
#'  represents a sample with Cond1type
#' @param TableCond2 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond2type
#' @param typeOrder typeOrder
#' @export
#' @return table with DEGs, log Fold Change (FC), false discovery rate (FDR),
#' the gene expression level
#' for samples in  Cond1type, and Cond2type, and Delta value (the difference
#' of gene expression between the two
#' conditions multiplied logFC)
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut =  0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(
#'   dataFilt[,samplesNT],
#'   dataFilt[,samplesTP],
#'   Cond1type = "Normal",
#'   Cond2type = "Tumor"
#' )
#' dataDEGsFilt <- dataDEGs[abs(dataDEGs$logFC) >= 1,]
#' dataTP <- dataFilt[,samplesTP]
#' dataTN <- dataFilt[,samplesNT]
#' dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
#'   FC_FDR_table_mRNA = dataDEGsFilt,
#'   typeCond1 = "Tumor",
#'   typeCond2 = "Normal",
#'   TableCond1 = dataTP,
#'   TableCond2 = dataTN
#' )
TCGAanalyze_LevelTab <- function(
        FC_FDR_table_mRNA,
        typeCond1,
        typeCond2,
        TableCond1,
        TableCond2,
        typeOrder = TRUE
) {
    TableLevel <- data.frame(
        "mRNA" = rownames(FC_FDR_table_mRNA),
        "logFC" = FC_FDR_table_mRNA$logFC,
        "FDR" = FC_FDR_table_mRNA$FDR,
        "Delta" =  FC_FDR_table_mRNA$logFC * rowMeans(TableCond1[rownames(FC_FDR_table_mRNA),],na.rm = TRUE)
    )
    TableLevel[[typeCond1]] <- rowMeans(TableCond1[TableLevel$mRNA,],na.rm = TRUE)
    TableLevel[[typeCond2]] <- rowMeans(TableCond2[TableLevel$mRNA,],na.rm = TRUE)

    TableLevel <- TableLevel[order(as.numeric(TableLevel[, "Delta"]), decreasing = typeOrder),]
    rownames(TableLevel) <-  TableLevel[, "mRNA"]

    if (all(grepl("ENSG", rownames(TableLevel))))
        TableLevel <- cbind(TableLevel, map.ensg(genes = rownames(TableLevel))[, 2:3])

    return(TableLevel)
}

#' @title Enrichment analysis for Gene Ontology (GO) [BP,MF,CC] and Pathways
#' @description
#'   Researchers, in order to better understand the underlying biological
#'   processes, often want to retrieve a functional profile of a set of genes
#'   that might have an important role. This can be done by performing an
#'   enrichment analysis.
#'
#'We will perform an enrichment analysis on gene sets using the TCGAanalyze_EAcomplete
#'function. Given a set of genes that are
#'up-regulated under certain conditions, an enrichment analysis will find
#'identify classes of genes or proteins that are #'over-represented using
#'annotations for that gene set.
#' @param TFname is the name of the list of genes or TF's regulon.
#' @param RegulonList List of genes such as TF's regulon or DEGs where to find enrichment.
#' @export
#' @return Enrichment analysis GO[BP,MF,CC] and Pathways complete table enriched by genelist.
#' @examples
#' Genelist <- c("FN1","COL1A1")
#' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
#' \dontrun{
#' Genelist <- rownames(dataDEGsFiltLevel)
#' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist))
#' }
TCGAanalyze_EAcomplete <- function(TFname, RegulonList) {
    # This is a verification of the input
    # in case the List is like Gene|ID
    # we will get only the Gene
    if (all(grepl("\\|", RegulonList))) {
        RegulonList <- strsplit(RegulonList, "\\|")
        RegulonList <- unlist(lapply(RegulonList, function(x)
            x[1]))
    }

    print(
        paste(
            "I need about ",
            "1 minute to finish complete ",
            "Enrichment analysis GO[BP,MF,CC] and Pathways... "
        )
    )

    ResBP <- TCGAanalyze_EA(TFname, RegulonList, DAVID_BP_matrix,
                            EAGenes, GOtype = "DavidBP")
    print("GO Enrichment Analysis BP completed....done")
    ResMF <- TCGAanalyze_EA(TFname, RegulonList, DAVID_MF_matrix,
                            EAGenes, GOtype = "DavidMF")
    print("GO Enrichment Analysis MF completed....done")
    ResCC <- TCGAanalyze_EA(TFname, RegulonList, DAVID_CC_matrix,
                            EAGenes, GOtype = "DavidCC")
    print("GO Enrichment Analysis CC completed....done")
    ResPat <- TCGAanalyze_EA(TFname, RegulonList, listEA_pathways,
                             EAGenes, GOtype = "Pathway")
    print("Pathway Enrichment Analysis completed....done")

    ans <-
        list(
            ResBP = ResBP,
            ResMF = ResMF,
            ResCC = ResCC,
            ResPat = ResPat
        )
    return(ans)
}

#' @title Enrichment analysis of a gene-set with GO [BP,MF,CC]  and pathways.
#' @description
#' The rational behind a enrichment analysis ( gene-set, pathway etc) is to compute
#' statistics of whether the overlap between the focus list (signature) and the gene-set
#' is significant. ie the confidence that overlap between the list is not due to chance.
#'  The Gene Ontology project describes genes (gene products) using terms from
#'  three structured vocabularies: biological process, cellular component and molecular function.
#'  The Gene Ontology Enrichment component, also referred to as the GO Terms" component, allows
#'  the genes in any such "changed-gene" list to be characterized using the Gene Ontology terms
#'  annotated to them. It asks, whether for any particular GO term, the fraction of genes
#'  assigned to it in the "changed-gene" list is higher than expected by chance
#'  (is over-represented), relative to the fraction of genes assigned to that term in the
#'  reference set.
#'  In statistical terms it perform the analysis tests the null hypothesis that,
#'  for any particular ontology term, there is no difference in the proportion of genes
#'  annotated to it in the reference list and the proportion annotated to it in the test list.
#'  We adopted a Fisher Exact Test to perform the EA.
#' @param GeneName is the name of gene signatures list
#' @param TableEnrichment is a table related to annotations of gene symbols such as
#' GO[BP,MF,CC] and Pathways. It was created from DAVID gene ontology on-line.
#' @param RegulonList is a gene signature (lisf of genes) in which perform EA.
#' @param GOtype is type of gene ontology Biological process (BP), Molecular Function (MF),
#' Cellular componet (CC)
#' @param FDRThresh pvalue corrected (FDR) as threshold to selected significant
#' BP, MF,CC, or pathways. (default FDR < 0.01)
#' @param EAGenes is a table with informations about genes
#' such as ID, Gene, Description, Location and Family.
#' @param GeneSymbolsTable if it is TRUE will return a table with GeneSymbols in common GO or pathways.
# @export
#' @import stats
#' @return Table with enriched GO or pathways by selected gene signature.
#' @examples
#' \dontrun{
#' EAGenes <- get("EAGenes")
#' RegulonList <- rownames(dataDEGsFiltLevel)
#' ResBP <- TCGAanalyze_EA(
#'   GeneName="DEA genes Normal Vs Tumor",
#'   RegulonList = RegulonList,
#'   TableEnrichment = DAVID_BP_matrix,
#'    EAGenes = EAGenes,
#'    GOtype = "DavidBP"
#'  )
#'}
TCGAanalyze_EA <- function (
        GeneName,
        RegulonList,
        TableEnrichment,
        EAGenes,
        GOtype,
        FDRThresh = 0.01,
        GeneSymbolsTable = FALSE
) {
    topPathways <- nrow(TableEnrichment)
    topPathways_tab <- matrix(0, 1, topPathways)
    topPathways_tab <- as.matrix(topPathways_tab)
    rownames(topPathways_tab) <- GeneName
    rownames(EAGenes) <- toupper(rownames(EAGenes))
    EAGenes <- EAGenes[!duplicated(EAGenes[, "ID"]),]
    rownames(EAGenes) <- EAGenes[, "ID"]
    allgene <- EAGenes[, "ID"]
    current_pathway_from_EA <- as.matrix(TableEnrichment[, GOtype])
    TableNames <- gsub("David",
                       "",
                       paste("Top ", GOtype, " n. ",
                             1:topPathways, " of ", topPathways, sep = ""))
    colnames(topPathways_tab) <- TableNames
    topPathways_tab <- as.data.frame(topPathways_tab)
    table_pathway_enriched <-  matrix(1, nrow(current_pathway_from_EA),  8)
    colnames(table_pathway_enriched) <- c(
        "Pathway",
        "GenesInPathway",
        "Pvalue",
        "FDR",
        "CommonGenesPathway",
        "PercentPathway",
        "PercentRegulon",
        "CommonGeneSymbols"
    )
    table_pathway_enriched <- as.data.frame(table_pathway_enriched)
    for (i in 1:nrow(current_pathway_from_EA)) {
        table_pathway_enriched[i, "Pathway"] <-
            as.character(current_pathway_from_EA[i,])
        if (nrow(TableEnrichment) == 589) {
            genes_from_current_pathway_from_EA <-
                GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
                                                     as.character(current_pathway_from_EA[i,]),][,
                                                                                                 "Molecules"], ",")
        }
        else {
            genes_from_current_pathway_from_EA <-
                GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
                                                     as.character(current_pathway_from_EA[i,]),][,
                                                                                                 "Molecules"], ", ")
        }
        genes_common_pathway_TFregulon <-  as.matrix(intersect(
            toupper(RegulonList),
            toupper(genes_from_current_pathway_from_EA)
        ))
        if (length(genes_common_pathway_TFregulon) != 0) {
            current_pathway_commongenes_num <-
                length(genes_common_pathway_TFregulon)
            seta <- allgene %in% RegulonList
            setb <- allgene %in% genes_from_current_pathway_from_EA
            ft <- fisher.test(seta, setb)
            FisherpvalueTF <- ft$p.value
            table_pathway_enriched[i, "Pvalue"] <-
                as.numeric(FisherpvalueTF)
            if (FisherpvalueTF < 0.01) {
                current_pathway_commongenes_percent <- paste(
                    "(",
                    format((
                        current_pathway_commongenes_num / length(genes_from_current_pathway_from_EA)
                    ) *
                        100,
                    digits = 2
                    ), "%)"
                )
                current_pathway_commongenes_num_with_percent <-
                    gsub(
                        " ",
                        "",
                        paste(
                            current_pathway_commongenes_num,
                            current_pathway_commongenes_percent,
                            "pv=",
                            format(FisherpvalueTF, digits = 2)
                        )
                    )
                table_pathway_enriched[i, "CommonGenesPathway"] <-
                    length(genes_common_pathway_TFregulon)
                table_pathway_enriched[i, "CommonGeneSymbols"] <-
                    paste0(genes_common_pathway_TFregulon, collapse = ";")
                table_pathway_enriched[i, "GenesInPathway"] <-
                    length(genes_from_current_pathway_from_EA)
                table_pathway_enriched[i, "PercentPathway"] <-
                    as.numeric(table_pathway_enriched[i,
                                                      "CommonGenesPathway"]) /
                    as.numeric(table_pathway_enriched[i,
                                                      "GenesInPathway"]) * 100
                table_pathway_enriched[i, "PercentRegulon"] <-
                    as.numeric(table_pathway_enriched[i,
                                                      "CommonGenesPathway"]) /
                    length(RegulonList) *
                    100
            }
        }
    }
    table_pathway_enriched <-
        table_pathway_enriched[order(table_pathway_enriched[,
                                                            "Pvalue"], decreasing = FALSE),]
    table_pathway_enriched <-
        table_pathway_enriched[table_pathway_enriched[,
                                                      "Pvalue"] < 0.01,]
    table_pathway_enriched[, "FDR"] <-
        p.adjust(table_pathway_enriched[,
                                        "Pvalue"], method = "fdr")
    table_pathway_enriched <-
        table_pathway_enriched[table_pathway_enriched[,
                                                      "FDR"] < FDRThresh,]
    table_pathway_enriched <-
        table_pathway_enriched[order(table_pathway_enriched[,
                                                            "FDR"], decreasing = FALSE),]

    table_pathway_enriched_filt <-
        table_pathway_enriched[table_pathway_enriched$FDR < FDRThresh, ]

    if (nrow(table_pathway_enriched) > 0) {
        tmp <- table_pathway_enriched
        tmp <- paste(
            tmp[, "Pathway"],
            "; FDR= ",
            format(tmp[,
                       "FDR"], digits = 3),
            "; (ng=",
            round(tmp[, "GenesInPathway"]),
            "); (ncommon=",
            format(tmp[, "CommonGenesPathway"],
                   digits = 2),
            ")",
            sep = ""
        )
        tmp <- as.matrix(tmp)
        topPathways_tab <-
            topPathways_tab[, 1:nrow(table_pathway_enriched),
                            drop = FALSE]
        topPathways_tab[1,] <- tmp
    }
    else {
        topPathways_tab <- NA
    }

    if (GeneSymbolsTable == FALSE) {
        return(topPathways_tab)
    }

    if (GeneSymbolsTable == TRUE) {
        return(table_pathway_enriched_filt)
    }
}

#' @title Differentially expression analysis (DEA) using limma package.
#' @description Differentially expression analysis (DEA) using limma package.
#' @param FC.cut write
#' @param AffySet A matrix-like data object containing log-ratios or log-expression values
#' for a series of arrays, with rows corresponding to genes and columns to samples
#' @examples
#' \dontrun{
#' to add example
#' }
#' @export
#' @return List of list with tables in 2 by 2 comparison
#' of the top-ranked genes from a linear model fitted by DEA's limma
TCGAanalyze_DEA_Affy <- function(AffySet, FC.cut = 0.01) {
    if (!requireNamespace("Biobase", quietly = TRUE)) {
        stop("Biobase package is needed for this function to work. Please install it.",
             call. = FALSE)
    }
    if (!requireNamespace("limma", quietly = TRUE)) {
        stop("limma package is needed for this function to work. Please install it.",
             call. = FALSE)
    }
    Pdatatable <- Biobase::phenoData(AffySet)

    f <- factor(Pdatatable$Disease)
    groupColors <- names(table(f))

    tmp <- matrix(0, length(groupColors), length(groupColors))
    colnames(tmp) <- groupColors
    rownames(tmp) <- groupColors
    tmp[upper.tri(tmp)] <- 1


    sample_tab <- Pdatatable
    f <- factor(Pdatatable$Disease)
    design <- model.matrix( ~ 0 + f)
    colnames(design) <- levels(f)
    fit <- limma::lmFit(AffySet, design) ## fit is an object of class MArrayLM.

    groupColors <- names(table(Pdatatable$Disease))

    CompleteList <- vector("list", sum(tmp))

    k <- 1

    for (i in 1:length(groupColors)) {
        col1 <- colnames(tmp)[i]
        for (j in 1:length(groupColors)) {
            col2 <- rownames(tmp)[j]

            if (i != j) {
                if (tmp[i, j] != 0) {
                    Comparison <- paste(col2, "-", col1, sep = "")

                    if (i == 4 &&
                        j == 6) {
                        Comparison <- paste(col1, "-", col2, sep = "")
                    }
                    if (i == 5 &&
                        j == 6) {
                        Comparison <- paste(col1, "-", col2, sep = "")
                    }

                    print(paste(i, j, Comparison, "to do..."))

                    cont.matrix <- limma::makeContrasts(I = Comparison, levels = design)

                    fit2 <- limma::contrasts.fit(fit, cont.matrix)
                    fit2 <- limma::eBayes(fit2)



                    sigI <-  limma::topTable(
                        fit2,
                        coef = 1,
                        adjust.method = "BH",
                        sort.by = "B",
                        p.value = 0.05,
                        lfc = FC.cut,
                        number = 50000
                    )

                    sigIbis <-   sigI[order(abs(as.numeric(sigI$logFC)), decreasing = TRUE), ]
                    names(CompleteList)[k] <- gsub("-", "_", Comparison)
                    CompleteList[[k]] <- sigIbis
                    k <- k + 1
                }
            }
        }

    }

    return(CompleteList)
}


#' @title Generate network
#' @description TCGAanalyze_analyseGRN perform gene regulatory network.
#' @param TFs a vector of genes.
#' @param normCounts is a matrix of gene expression with genes in rows and samples in columns.
#' @param kNum the number of nearest neighbors to consider to estimate the mutual information.
#' Must be less than the number of columns of normCounts.
#' @export
#' @return an adjacent matrix
TCGAanalyze_analyseGRN <- function(TFs, normCounts, kNum) {

    if (!requireNamespace("parmigene", quietly = TRUE)) {
        stop(
            "parmigene package is needed for this function to work. Please install it.",
            call. = FALSE
        )
    }
    MRcandidates <- intersect(rownames(normCounts), TFs)

    # Mutual information between TF and genes
    sampleNames <- colnames(normCounts)
    geneNames <- rownames(normCounts)

    messageMI_TFgenes <-  paste(
        "Estimation of MI among [",
        length(MRcandidates),
        " TRs and ",
        nrow(normCounts),
        " genes].....",
        sep = ""
    )
    timeEstimatedMI_TFgenes1 <- length(MRcandidates) * nrow(normCounts) / 1000
    timeEstimatedMI_TFgenes <-  format(timeEstimatedMI_TFgenes1 * ncol(normCounts) / 17000, digits = 2)
    messageEstimation <- print(
        paste(
            "I Need about ",
            timeEstimatedMI_TFgenes,
            "seconds for this MI estimation. [Processing 17000k elements /s]  "
        )
    )

    miTFGenes <- parmigene::knnmi.cross(normCounts[MRcandidates,], normCounts, k = kNum)

    return(miTFGenes)

}

#' @title Generate pathview graph
#' @description TCGAanalyze_Pathview pathway based data integration and visualization.
#' @param dataDEGs dataDEGs
#' @param pathwayKEGG pathwayKEGG
#' @export
#' @return an adjacent matrix
#' @examples
#' \dontrun{
#'   dataDEGs <- data.frame(mRNA = c("TP53","TP63","TP73"), logFC = c(1,2,3))
#'   TCGAanalyze_Pathview(dataDEGs)
#' }
TCGAanalyze_Pathview <-
    function(dataDEGs, pathwayKEGG = "hsa05200") {
        if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
            stop("clusterProfiler needed for this function to work. Please install it.",
                 call. = FALSE)
        }
        if (!requireNamespace("pathview", quietly = TRUE)) {
            stop("pathview needed for this function to work. Please install it.",
                 call. = FALSE)
        }
        # Converting Gene symbol to gene ID
        eg = as.data.frame(
            clusterProfiler::bitr(
                dataDEGs$mRNA,
                fromType = "SYMBOL",
                toType = "ENTREZID",
                OrgDb = "org.Hs.eg.db"
            )
        )
        eg <- eg[!duplicated(eg$SYMBOL), ]
        dataDEGs <- dataDEGs[dataDEGs$mRNA %in% eg$SYMBOL, ]
        dataDEGs <- dataDEGs[order(dataDEGs$mRNA, decreasing = FALSE), ]
        eg <- eg[order(eg$SYMBOL, decreasing = FALSE), ]
        dataDEGs$GeneID <- eg$ENTREZID
        dataDEGsFiltLevel_sub <-
            subset(dataDEGs, select = c("GeneID", "logFC"))
        genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
        names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID

        hsa05200 <- pathview::pathview(
            gene.data  = genelistDEGs,
            pathway.id = pathwayKEGG,
            species    = "hsa",
            limit      = list(gene = as.integer(max(
                abs(genelistDEGs)
            )))
        )

    }


#' @title infer gene regulatory networks
#' @description TCGAanalyze_networkInference taking expression data as input,
#' this will return an adjacency matrix of interactions
#' @param data expression data, genes in columns, samples in rows
#' @param optionMethod inference method, chose from aracne, c3net, clr and mrnet
#' @export
#' @return an adjacent matrix
TCGAanalyze_networkInference <-
    function(data, optionMethod = "clr") {
        # Converting Gene symbol to gene ID
        if (optionMethod == "c3net") {
            if (!requireNamespace("c3net", quietly = TRUE)) {
                stop(
                    "c3net package is needed for this function to work. Please install it.",
                    call. = FALSE
                )
            }

            net <- c3net::c3net(t(data))
        } else {
            if (!requireNamespace("minet", quietly = TRUE)) {
                stop(
                    "minet package is needed for this function to work. Please install it.",
                    call. = FALSE
                )
            }
            net <- minet::minet(data, method = optionMethod)
        }
        return(net)

    }


#' Creates a plot for GAIA output (all significant aberrant regions.)
#' @description
#' This function is a auxiliary function to visualize GAIA output
#' (all significant aberrant regions.)
#' @param calls A matrix with the following columns: Chromossome, Aberration Kind
#' Region Start, Region End, Region Size and score
#' @param threshold Score threshold (orange horizontal line in the plot)
#' @export
#' @importFrom graphics abline axis legend plot points
#' @return A plot with all significant aberrant regions.
#' @examples
#' call <- data.frame("Chromossome" = rep(9,100),
#'                    "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#'                    "Region Start [bp]" = 18259823:18259922,
#'                    "Region End [bp]" = 18259823:18259922,
#'                    "score" = rep(c(1,2,3,4),25))
#'  gaiaCNVplot(call,threshold = 0.01)
#'  call <- data.frame("Chromossome" = rep(c(1,9),50),
#'                     "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#'                     "Region Start [bp]" = 18259823:18259922,
#'                     "Region End [bp]" = 18259823:18259922,
#'                     "score" = rep(c(1,2,3,4),25))
#'  gaiaCNVplot(call,threshold = 0.01)
gaiaCNVplot <- function (calls,  threshold = 0.01) {
    Calls <-  calls[order(calls[, grep("start", colnames(calls), ignore.case = TRUE)]), ]
    Calls <- Calls[order(Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]), ]
    rownames(Calls) <- NULL
    Chromo <- Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]
    Gains <- apply(Calls, 1, function(x)
        ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 1, x["score"], 0))
    Losses <- apply(Calls, 1, function(x)
        ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 0, x["score"], 0))
    plot(
        Gains,
        ylim = c(-max(Calls[, "score"] + 2), max(Calls[, "score"] + 2)),
        type = "h",
        col = "red",
        xlab = "Chromosome",
        ylab = "Score",
        xaxt = "n"
    )
    points(-(Losses), type = "h", col = "blue")
    # Draw origin line
    abline(h = 0, cex = 4)
    # Draw threshold lines
    abline(
        h = -log10(threshold),
        col = "orange",
        cex = 4,
        main = "test"
    )
    abline(
        h = log10(threshold),
        col = "orange",
        cex = 4,
        main = "test"
    )

    uni.chr <- unique(Chromo)
    temp <- rep(0, length(uni.chr))
    for (i in 1:length(uni.chr)) {
        temp[i] <- max(which(uni.chr[i] == Chromo))
    }
    for (i in 1:length(temp)) {
        abline(v = temp[i],
               col = "black",
               lty = "dashed")
    }
    nChroms <- length(uni.chr)
    begin <- c()
    for (d in 1:nChroms) {
        chrom <- sum(Chromo == uni.chr[d])
        begin <- append(begin, chrom)
    }
    temp2 <- rep(0, nChroms)
    for (i in 1:nChroms) {
        if (i == 1) {
            temp2[1] <- (begin[1] * 0.5)
        }
        else if (i > 1) {
            temp2[i] <- temp[i - 1] + (begin[i] * 0.5)
        }
    }
    uni.chr[uni.chr == 23] <- "X"
    uni.chr[uni.chr == 24] <- "Y"
    for (i in 1:length(temp)) {
        axis(1,
             at = temp2[i],
             labels = uni.chr[i],
             cex.axis = 1)
    }
    legend(
        x = 1,
        y = max(Calls[, "score"] + 2),
        y.intersp = 0.8,
        c("Amp"),
        pch = 15,
        col = c("red"),
        text.font = 3
    )
    legend(
        x = 1,
        y = -max(Calls[, "score"] + 0.5),
        y.intersp = 0.8,
        c("Del"),
        pch = 15,
        col = c("blue"),
        text.font = 3
    )
}

#' Get a matrix of interactions of genes from biogrid
#' @description
#' Using biogrid database, it will create a matrix of gene interactions.
#' If columns A and row B has value 1, it means the gene A and gene B interacts.
#' @param tmp.biogrid Biogrid table
#' @export
#' @param names.genes List of genes to filter from output. Default: consider all genes
#' @return A matrix with 1 for genes that interacts, 0 for no interaction.
#' @examples
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' tmp.biogrid <- data.frame("Official.Symbol.Interactor.A" = names.genes.de,
#'                           "Official.Symbol.Interactor.B" = rev(names.genes.de))
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' \dontrun{
#'   file <- paste0("http://thebiogrid.org/downloads/archives/",
#'                  "Release%20Archive/BIOGRID-3.4.133/BIOGRID-ALL-3.4.133.tab2.zip")
#'   downloader::download(file,basename(file))
#'   unzip(basename(file),junkpaths =TRUE)
#'   tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)),
#'                           header=TRUE, sep="\t", stringsAsFactors=FALSE)
#'   names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#'   net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' }
getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) {
    it.a <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[1]
    it.b <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[2]

    if (is.null(names.genes)) {
        names.genes <-
            sort(union(unique(tmp.biogrid[, it.a]), unique(tmp.biogrid[, it.b])))
        ind <- seq(1, nrow(tmp.biogrid))
    } else {
        ind.A <- which(tmp.biogrid[, it.a] %in% names.genes)
        ind.B <- which(tmp.biogrid[, it.b] %in% names.genes)
        ind <- intersect(ind.A, ind.B)
    }

    mat.biogrid <- matrix(
        0,
        nrow = length(names.genes),
        ncol = length(names.genes),
        dimnames = list(names.genes, names.genes)
    )

    for (i in ind) {
        mat.biogrid[tmp.biogrid[i, it.a], tmp.biogrid[i, it.b]] <-
            mat.biogrid[tmp.biogrid[i, it.b], tmp.biogrid[i, it.a]] <- 1
    }
    diag(mat.biogrid) <- 0

    return(mat.biogrid)
}

#' @title Get GDC primary tumors samples with both DNA methylation (HM450K) and Gene expression data
#' @description
#' For a given TCGA project it gets the  primary tumors samples (barcode) with both
#' DNA methylation and Gene expression data from GDC database
#' @param project A GDC project
#' @param n Number of samples to return. If NULL return all (default)
#' @return A vector of barcodes
#' @export
#' @examples
#' # Get ACC samples with both  DNA methylation (HM450K) and gene expression aligned to hg19
#' samples <- matchedMetExp("TCGA-UCS")
matchedMetExp <- function(
        project,
        n = NULL
) {

    # get primary solid tumor samples: DNA methylation
    message("Download DNA methylation information")
    met450k <- GDCquery(
        project = project,
        data.category = "DNA Methylation",
        platform = "Illumina Human Methylation 450",
        sample.type = c("Primary Tumor")
    )

    # get primary solid tumor samples: RNAseq
    message("Download gene expression information")
    exp <- GDCquery(
        project = project,
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "STAR - Counts"
    )

    # Get patients with samples in both platforms
    met450k_tp <- met450k$results[[1]]$cases
    exp_tp <- exp$results[[1]]$cases

    patients <-
        substr(exp_tp, 1, 15)[
            substr(exp_tp, 1, 15) %in% substr(met450k_tp, 1, 15)
        ] |> unique()

    if (!is.null(n)) {
        patients <- patients[1:n] # get only n samples
    }

    return(patients)
}
BioinformaticsFMRP/TCGAbiolinks documentation built on April 12, 2024, 2:08 a.m.