R/methylation.R

Defines functions .fun TCGAvisualize_meanMethylation TCGAanalyze_survival

Documented in TCGAanalyze_survival TCGAvisualize_meanMethylation

#' @title Creates survival analysis
#' @description Creates a survival plot from TCGA patient clinical data
#' using survival library. It uses the fields days_to_death and vital, plus a
#' columns for groups.
#'
#' @param data TCGA Clinical patient with the information days_to_death
#' @param clusterCol Column with groups to plot. This is a mandatory field, the
#' caption will be based in this column
#' @param legend Legend title of the figure
#' @param xlim x axis limits e.g. xlim = c(0, 1000). Present narrower X axis, but not affect survival estimates.
#' @param main main title of the plot
#' @param labels labels of the plot
#' @param ylab y axis text of the plot
#' @param xlab x axis text of the plot
#' @param filename The name of the pdf file.
#' @param color Define the colors/Pallete for lines.
#' @param risk.table show or not the risk table
#' @param width Image width
#' @param height Image height
#' @param pvalue show p-value of log-rank test
#' @param conf.int  show confidence intervals for point estimates of survival curves.
#' @param ... Further arguments passed to \link[survminer]{ggsurvplot}.
#' @param dpi Figure quality
#' @export
#' @return Survival plot
#' @examples
#'  # clin <- GDCquery_clinic("TCGA-BRCA","clinical")
#'  clin <- data.frame(
#'       vital_status = c("alive","alive","alive","dead","alive",
#'                        "alive","dead","alive","dead","alive"),
#'       days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA),
#'       days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417),
#'       gender = c(rep("male",5),rep("female",5))
#'  )
#' TCGAanalyze_survival(clin, clusterCol="gender")
#' TCGAanalyze_survival(clin, clusterCol="gender", xlim = 1000)
#' TCGAanalyze_survival(clin,
#'                      clusterCol="gender",
#'                      risk.table = FALSE,
#'                      conf.int = FALSE,
#'                      color = c("pink","blue"))
#' TCGAanalyze_survival(clin,
#'                      clusterCol="gender",
#'                      risk.table = FALSE,
#'                      xlim = c(100,1000),
#'                      conf.int = FALSE,
#'                      color = c("Dark2"))
TCGAanalyze_survival <- function(
    data,
    clusterCol = NULL,
    legend = "Legend",
    labels = NULL,
    risk.table = TRUE,
    xlim = NULL,
    main = "Kaplan-Meier Overall Survival Curves",
    ylab = "Probability of survival",
    xlab = "Time since diagnosis (days)",
    filename = "survival.pdf",
    color = NULL,
    height = 8,
    width = 12,
    dpi = 300,
    pvalue = TRUE,
    conf.int = TRUE,
    ...) {
    .e <- environment()

    check_package("survminer")
    check_package("survival")
    check_package("gridExtra")

    if (!all(
        c(
            "vital_status",
            "days_to_death",
            "days_to_last_follow_up"
        ) %in% colnames(data)
    ))
        stop(
            "Columns vital_status, days_to_death and  days_to_last_follow_up should be in data frame"
        )

    if (is.null(color)) {
        color <- rainbow(length(unique(data[, clusterCol])))
    }

    group <- NULL
    if (is.null(clusterCol)) {
        stop("Please provide the clusterCol argument")
    } else if (length(unique(data[, clusterCol])) == 1) {
        stop(
            paste0(
                "Sorry, but I'm expecting at least two groups\n",
                "  Only this group found: ",
                unique(data[, clusterCol])
            )
        )
    }
    notDead <- is.na(data$days_to_death)

    if (any(notDead == TRUE)) {
        data[notDead, "days_to_death"] <-
            data[notDead, "days_to_last_follow_up"]
    }
    if (length(data[which((data[, "days_to_death"] < 0) == T), "sample"]) > 0 &
        "sample" %in% colnames(data)) {
        message(
            "Incosistencies in the data were found. The following samples have a negative days_to_death value:"
        )
        message(paste(data[which((data[, "days_to_death"] < 0) == T), "sample"], collapse = ", "))
    }
    if (any(is.na(data[, "days_to_death"])) &
        "sample" %in% colnames(data)) {
        message(
            "Incosistencies in the data were found. The following samples have a NA days_to_death value:"
        )
        message(paste(data[is.na(data[, "days_to_death"]), "sample"], collapse = ", "))
    }

    # create a column to be used with survival package, info need
    # to be TRUE(DEAD)/FALSE (ALIVE)
    data$s <- grepl("dead|deceased", data$vital_status, ignore.case = TRUE)

    # Column with groups
    data$type <- as.factor(data[, clusterCol])
    data <-  data[, c("days_to_death", "s", "type")]
    # create the formula for survival analysis
    f.m <-
        formula(survival::Surv(as.numeric(data$days_to_death), event = data$s) ~ data$type)
    fit <- do.call(survival::survfit, list(formula = f.m, data = data))

    label.add.n <- function(x) {
        na.idx <- is.na(data[, "days_to_death"])
        negative.idx <- data[, "days_to_death"] < 0
        idx <- !(na.idx | negative.idx)
        return(paste0(x, " (n = ",
                      sum(data[idx, "type"] == x), ")"))
    }

    if (is.null(labels)) {
        d <- survminer::surv_summary(fit, data = data)
        order <-
            unname(sapply(levels(d$strata), function(x)
                unlist(str_split(x, "="))[2]))
        labels <- sapply(order, label.add.n)
    }
    if (length(xlim) == 1) {
        xlim <- c(0, xlim)
    }
    suppressWarnings({
        surv <- survminer::ggsurvplot(
            fit,
            # survfit object with calculated statistics.
            risk.table = risk.table,
            # show risk table.
            pval = pvalue,
            # show p-value of log-rank test.
            conf.int = conf.int,
            # show confidence intervals for point estimaes of survival curves.
            xlim = xlim,
            # present narrower X axis, but not affect survival estimates.
            main = main,
            # Title
            xlab = xlab,
            # customize X axis label.
            legend.title = legend,
            # Legend title
            legend.labs = labels,
            # change legend labels.
            palette =  color,
            # custom color palettes.
            ...
        )
    })


    if (!is.null(filename)) {
        ggsave(
            surv$plot,
            filename = filename,
            width = width,
            height = height,
            dpi = dpi
        )
        message(paste0("File saved as: ", filename))
        if (risk.table) {
            g1 <- ggplotGrob(surv$plot)
            g2 <- ggplotGrob(surv$table)
            min_ncol <- min(ncol(g2), ncol(g1))
            g <-
                gridExtra::gtable_rbind(g1[, 1:min_ncol], g2[, 1:min_ncol], size = "last")
            ggsave(
                g,
                filename = filename,
                width = width,
                height = height,
                dpi = dpi
            )
        }
    } else {
        return(surv)
    }
}
#' @title Mean methylation boxplot
#' @description
#'   Creates a mean methylation boxplot for groups (groupCol),
#'   subgroups will be highlighted as shapes if the subgroupCol was set.
#'
#'   Observation: Data is a summarizedExperiment.
#'
#' @param data SummarizedExperiment object obtained from TCGAPrepare
#' @param groupCol Columns in colData(data) that defines the groups. If no
#' columns defined a columns called "Patients" will be used
#' @param subgroupCol Columns in colData(data) that defines the subgroups.
#' @param shapes Shape vector of the subgroups. It must have the size of the levels
#' of the subgroups. Example: shapes = c(21,23) if for two levels
#' @param filename The name of the pdf that will be saved
#' @param subgroup.legend Name of the subgroup legend. DEFAULT: subgroupCol
#' @param group.legend Name of the group legend. DEFAULT: groupCol
#' @param color vector of colors to be used in graph
#' @param title main title in the plot
#' @param ylab y axis text in the plot
#' @param print.pvalue Print p-value for two groups
#' @param xlab x axis text in the plot
#' @param labels Labels of the groups
#' @param sort Sort boxplot by mean or median.
#' Possible values: mean.asc, mean.desc, median.asc, median.desc
#' @param plot.jitter Plot jitter? Default TRUE
#' @param jitter.size Plot jitter size? Default 3
#' @param height Plot height default:10
#' @param width Plot width default:10
#' @param dpi Pdf dpi default:600
#' @param order Order of the boxplots
#' @param axis.text.x.angle Angle of text in the x axis
#' @param y.limits Change lower/upper y-axis limit
#' @param legend.position Legend position ("top", "right","left","bottom")
#' @param legend.title.position  Legend title position ("top", "right","left","bottom")
#' @param legend.ncols Number of columns of the legend
#' @param add.axis.x.text Add text to x-axis? Default: FALSE
#' @import ggplot2 stats
#' @importFrom SummarizedExperiment colData rowRanges assay
#' @importFrom grDevices rainbow
# ' @importFrom gtools combinations
#' @importFrom plyr ddply . summarize
#' @importFrom knitr kable
#' @export
#' @return Save the pdf survival plot
#' @examples
#' nrows <- 200; ncols <- 21
#' counts <- matrix(runif(nrows * ncols, 0, 1), nrows)
#' rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(50, 150)),
#'                    IRanges::IRanges(floor(runif(200, 1e5, 1e6)), width=100),
#'                     strand=sample(c("+", "-"), 200, TRUE),
#'                     feature_id=sprintf("ID%03d", 1:200))
#'colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input","Other"), 7),
#'                     row.names=LETTERS[1:21],
#'                     group=rep(c("group1","group2","group3"),c(7,7,7)),
#'                     subgroup=rep(c("subgroup1","subgroup2","subgroup3"),7))
#'data <- SummarizedExperiment::SummarizedExperiment(
#'          assays=S4Vectors::SimpleList(counts=counts),
#'          rowRanges=rowRanges,
#'          colData=colData)
#' TCGAvisualize_meanMethylation(data,groupCol  = "group")
#' # change lower/upper y-axis limit
#' TCGAvisualize_meanMethylation(data,groupCol  = "group", y.limits = c(0,1))
#' # change lower y-axis limit
#' TCGAvisualize_meanMethylation(data,groupCol  = "group", y.limits = 0)
#' TCGAvisualize_meanMethylation(data,groupCol  = "group", subgroupCol="subgroup")
#' TCGAvisualize_meanMethylation(data,groupCol  = "group")
#' TCGAvisualize_meanMethylation(data,groupCol  = "group",sort="mean.desc",filename="meandesc.pdf")
#' TCGAvisualize_meanMethylation(data,groupCol  = "group",sort="mean.asc",filename="meanasc.pdf")
#' TCGAvisualize_meanMethylation(data,groupCol  = "group",sort="median.asc",filename="medianasc.pdf")
#' TCGAvisualize_meanMethylation(data,groupCol  = "group",sort="median.desc",filename="mediandesc.pdf")
TCGAvisualize_meanMethylation <- function(
    data,
    groupCol = NULL,
    subgroupCol = NULL,
    shapes = NULL,
    print.pvalue = FALSE,
    plot.jitter = TRUE,
    jitter.size = 3,
    filename = "groupMeanMet.pdf",
    ylab = expression(paste("Mean DNA methylation (",
                            beta, "-values)")),
    xlab = NULL,
    title = "Mean DNA methylation",
    labels = NULL,
    group.legend = NULL,
    subgroup.legend = NULL,
    color = NULL,
    y.limits = NULL,
    sort,
    order,
    legend.position = "top",
    legend.title.position = "top",
    legend.ncols = 3,
    add.axis.x.text = TRUE,
    width = 10,
    height = 10,
    dpi = 600,
    axis.text.x.angle = 90) {
    .e <- environment()
    mean <- colMeans(assay(data), na.rm = TRUE)

    if (is.null(groupCol)) {
        groups <- rep("Patient", length(mean))
    } else {
        if (!(groupCol %in% colnames(colData(data))))
            stop("groupCol not found in the object")
        groups <- colData(data)[, groupCol]
    }

    if (is.null(subgroupCol)) {
        subgroups <- NULL
    } else {
        if (!(subgroupCol %in% colnames(colData(data))))
            stop("subgroupCol not found in the object")
        subgroups <- colData(data)[, subgroupCol]
    }

    if (!is.null(subgroupCol)) {
        df <-
            data.frame(
                mean = mean,
                groups = groups,
                subgroups = subgroups,
                samples = colnames(data)
            )
    } else {
        df <-
            data.frame(mean = mean,
                       groups = groups,
                       samples = colnames(data))
    }
    message("==================== DATA Summary ====================")
    data.summary <- ddply(
        df,
        .(groups),
        summarize,
        Mean = mean(mean),
        Median = median(mean),
        Max = max(mean),
        Min = min(mean)
    )
    print(data.summary)
    message("==================== END DATA Summary ====================")

    groups <- unique(df$groups)
    mat.pvalue <- matrix(
        ncol = length(groups),
        nrow = length(groups),
        dimnames = list(groups, groups)
    )

    if (length(groups) > 1) {
        comb2by2 <- t(combn(unique(df$groups), 2))

        for (i in 1:nrow(comb2by2)) {
            try({
                aux <- t.test(mean ~ groups,
                              data = subset(df, subset = df$groups %in% comb2by2[i, ]))$p.value

                mat.pvalue[comb2by2[i, 1], comb2by2[i, 2]] <- aux
                mat.pvalue[comb2by2[i, 2], comb2by2[i, 1]] <- aux
            },
            silent = TRUE)
        }
        message("==================== T test results ====================")
        print(mat.pvalue)
        message("==================== END T test results ====================")

    }
    if (print.pvalue & length(unique(df$groups)) == 2) {
        pvalue <- t.test(mean ~ groups, data = df)$p.value
    } else {
        print.pvalue <- FALSE
    }
    # Plot for methylation analysis Axis x: LGm clusters Axis y:
    # mean methylation
    label.add.n <- function(x) {
        paste0(x, " (n = ",
               nrow(subset(df, subset = (
                   df$groups == x
               ))), ")")
    }
    if (is.null(group.legend)) {
        group.legend <- groupCol
    }
    if (is.null(subgroup.legend)) {
        subgroup.legend <- subgroupCol
    }

    if (missing(sort)) {
        if (missing(order)) {
            x <- factor(df$groups)
        } else {
            x <- factor(df$groups, levels = order)
        }
    } else if (sort == "mean.asc") {
        x <- reorder(df$groups, df$mean, FUN = "mean")
    } else  if (sort == "mean.desc") {
        x <- reorder(df$groups, -df$mean, FUN = "mean")
    } else if (sort == "median.asc") {
        x <- reorder(df$groups, df$mean, FUN = "median")
    } else if (sort == "median.desc") {
        x <- reorder(df$groups, -df$mean, FUN = "median")
    }
    x <- as.character(x)
    if (is.null(labels)) {
        labels <- unique(x)
        labels <- sapply(labels, label.add.n)
    }

    if (is.null(color)) {
        color <- rainbow(length(labels))
        color <-
            color[(match(unique(x), unique(as.character(df$groups))))]
    }
    print(x)
    print(df$groups)
    print(color)
    p <- ggplot(df, aes(x, df$mean),
                environment = .e) +
        geom_boxplot(aes(fill = x),
                     notchwidth = 0.25,
                     outlier.shape = NA)

    if (plot.jitter) {
        if (!is.null(subgroupCol)) {
            p <- p + geom_jitter(
                aes(shape = subgroups,
                    size =  subgroups),
                position = position_jitter(width = 0.1),
                size = jitter.size
            )
        } else {
            p <- p + geom_jitter(position = position_jitter(width = 0.1),
                                 size = jitter.size)
        }
    }
    if (add.axis.x.text) {
        axis.text.x <- element_text(angle = axis.text.x.angle,
                                    vjust = 0.5)
    } else {
        axis.text.x <-  element_blank()
    }
    p <- p + scale_fill_manual(
        values = color,
        labels = labels,
        name = group.legend
    )
    p <- p + scale_x_discrete(limits = levels(x))
    p <- p + ylab(ylab) + xlab(xlab) + labs(title = title) +
        labs(shape = subgroup.legend, color = group.legend) +
        theme_minimal() +
        theme(
            axis.text.x = axis.text.x,
            plot.title = element_text(face = "bold", hjust = 0.5),
            legend.position = legend.position,
            legend.key = element_rect(colour = 'white')
        ) +
        guides(
            fill = guide_legend(
                ncol = legend.ncols,
                title.position = legend.title.position,
                title.hjust = 0.5
            )
        )

    if (!is.null(shapes)) {
        p <- p + scale_shape_manual(values = shapes)
    }

    if (print.pvalue) {
        p <- p + annotate(
            "text",
            x = -Inf,
            y = -Inf,
            hjust = -0.1,
            vjust = -1.0,
            size = 5,
            label = paste0("P-value = ",
                           format(
                               pvalue, scientific = TRUE,
                               digits = 2
                           ))
        )
    }
    if (!is.null(y.limits)) {
        p <- p + expand_limits(y = y.limits)
    }

    # saving box plot to analyse it
    if (!is.null(filename)) {
        ggsave(
            p,
            filename = filename,
            width = width,
            height = height,
            dpi = dpi
        )
        message(paste("Plot saved in: ", file.path(getwd(), filename)))
    } else {
        return(p)
    }
}

#' @title Calculate pvalues
#' @description Calculate pvalues using wilcoxon test
#' @details
#'    Verify if the data is significant between two groups. For the methylation
#'    we search for probes that have a difference in the mean methylation and
#'    also a significant value.
#'    Input: A SummarizedExperiment object that will be used to
#'    compared two groups with wilcoxon test, a boolean value to do a
#'    paired or non-paired test
#'    Output: p-values (non-adj/adj) histograms, p-values (non-adj/adj)
#' @param data  SummarizedExperiment obtained from the TCGAPrepare
#' @param groupCol  Columns with the groups inside the SummarizedExperiment
#'  object. (This will be obtained by the function colData(data))
#' @param group1 In case our object has more than 2 groups, you should set the
#'  groups
#' @param group2 In case our object has more than 2 groups, you should set the
#'  groups
#' @param paired  Do a paired wilcoxon test? Default: True
#' @param adj.method P-value adjustment method. Default:"BH" Benjamini-Hochberg
#' @param alternative wilcoxon test alternative
#' @param cores Number of cores to be used
#' @return Data frame with cols p values/p values adjusted
#' @import graphics
#' @importFrom grDevices png dev.off pdf
#' @import stats
#' @importFrom SummarizedExperiment colData rowRanges rowRanges<- colData<-
#' @return Data frame with two cols
#'         p-values/p-values adjusted
#' @examples
#' nrows <- 200; ncols <- 20
#'  counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows,
#'            dimnames = list(paste0("cg",1:200),LETTERS[1:20]))
#' rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(50, 150)),
#'                    IRanges::IRanges(floor(runif(200, 1e5, 1e6)), width=100),
#'                     strand=sample(c("+", "-"), 200, TRUE),
#'                     feature_id=sprintf("ID%03d", 1:200))
#' colData <- S4Vectors::DataFrame(Treatment=rep(c("ChIP", "Input"), 10),
#'                     row.names=LETTERS[1:20],
#'                     group=rep(c("group1","group2"),c(10,10)))
#' data <- SummarizedExperiment::SummarizedExperiment(
#'          assays=S4Vectors::SimpleList(counts=counts),
#'          rowRanges=rowRanges,
#'          colData=colData)
#' results <- TCGAbiolinks:::dmc.non.parametric.se(data,"group")
#' @importFrom plyr adply
#' @importFrom stats wilcox.test
#' @keywords internal
dmc.non.parametric.se <- function(
    data,
    groupCol = NULL,
    group1 = NULL,
    group2 = NULL,
    paired = FALSE,
    adj.method = "BH",
    alternative = "two.sided",
    cores = 1
) {

    if (is.null(groupCol)) {
        message("Please, set the groupCol parameter")
        return(NULL)
    }

    # Can we define group1 and group2 automatically
    if(is.null(group1) & is.null(group2)){
        if(length(unique(colData(data)[, groupCol])) == 2){
            message("Defining groups")
            group1 <- unique(colData(data)[, groupCol])[1]
            group2 <- unique(colData(data)[, groupCol])[2]
        } else {
            message("Please, set the group1 and group2 parameters")
            return(NULL)
        }
    }

    message("Calculating the p-values of each probe...")
    # Apply Wilcoxon test in order to calculate the p-values
    idx1 <- which(colData(data)[, groupCol] == group1)
    idx2 <- which(colData(data)[, groupCol] == group2)

    if (!is.factor(colData(data)[, groupCol])) {
        colData(data)[, groupCol] <- factor(colData(data)[, groupCol])
    }
    m <- assay(data)
    df <- dmc.non.parametric(
        matrix = m,
        idx1 = idx1,
        idx2 =  idx2,
        paired = paired,
        adj.method = adj.method,
        alternative =  alternative,
        cores =  cores
    )

    group1.col <- gsub("[[:punct:]]| ", ".", group1)
    group2.col <- gsub("[[:punct:]]| ", ".", group2)
    colp <- paste("p.value",  group1.col,  group2.col, sep = ".")
    coladj <- paste("p.value.adj", group1.col,  group2.col, sep = ".")
    coldiffmean <- paste("mean", group1.col, "minus.mean", group2.col, sep = ".")
    colmeang1 <- paste("mean", group1.col, sep = ".")
    colmeang2 <- paste("mean", group2.col, sep = ".")

    colnames(df) <- c(colmeang1, colmeang2, coldiffmean, colp, coladj)
    return(df)
}

#' @title Perform non-parametrix wilcoxon test
#' @description Perform non-parametrix wilcoxon test
#' @param matrix  A matrix
#' @param idx1  Index columns group1
#' @param idx2  Index columns group2
#' @param paired  Do a paired wilcoxon test? Default: True
#' @param adj.method P-value adjustment method. Default:"BH" Benjamini-Hochberg
#' @param alternative wilcoxon test alternative
#' @param cores Number of cores to be used
#' @return Data frame with p-values and diff mean
#' @import stats
#' @examples
#'  nrows <- 200; ncols <- 20
#'  counts <- matrix(
#'    runif(nrows * ncols, 1, 1e4), nrows,
#'    dimnames = list(paste0("cg",1:200),paste0("S",1:20))
#'  )
#'  TCGAbiolinks:::dmc.non.parametric(counts,1:10,11:20)
dmc.non.parametric <-  function(
    matrix,
    idx1 = NULL,
    idx2 = NULL,
    paired = FALSE,
    adj.method = "BH",
    alternative = "two.sided",
    cores = 1)
{
    parallel <- set_cores(cores)

    p.value <- plyr::adply(
        .data = matrix,
        .margins =  1,
        .fun = function(x) {
            suppressMessages({
                wilcox.test(
                    x[idx1],
                    x[idx2],
                    paired = paired,
                    alternative = alternative)$p.value
            })
        }, .progress = "time", .parallel = parallel)

    p.value <- p.value[, 2]
    p.value.adj <- p.adjust(p.value, method = adj.method)
    mean.g1 <- rowMeans(matrix[, idx1, drop = FALSE], na.rm = TRUE)
    mean.g2 <- rowMeans(matrix[, idx2, drop = FALSE], na.rm = TRUE)
    mean.g1_minus_mean.g2 <- mean.g1 - mean.g2

    return(
        data.frame(
            mean.g1,
            mean.g2,
            mean.g1_minus_mean.g2,
            p.value,
            p.value.adj
        )
    )
}

#' @title Creates a volcano plot for DNA methylation or expression
#' @description Creates a volcano plot from the
#' expression and methylation analysis.
#' @details
#'    Creates a volcano plot from the expression and methylation analysis.
#'    Please see the vignette for more information
#'    Observation: This function automatically is called by TCGAanalyse_DMR
#' @param x x-axis data
#' @param y y-axis data
#' @param y.cut p-values threshold.
#' @param x.cut  x-axis threshold. Default: 0.0 If you give only one number (e.g. 0.2) the cut-offs will be
#'  -0.2 and 0.2. Or you can give different cut-offs as a vector (e.g. c(-0.3,0.4))
#' @param filename Filename. Default: volcano.pdf, volcano.svg, volcano.png
#' @param legend Legend title
#' @param color vector of colors to be used in graph
#' @param title main title. If not specified it will be
#' "Volcano plot (group1 vs group2)
#' @param ylab y axis text
#' @param xlab x axis text
#' @param xlim x limits to cut image
#' @param ylim y limits to cut image
#' @param height Figure height
#' @param width Figure width
#' @param names Names to be plotted if significant.
#' Should be the same size of x and y
#' @param names.fill Names should be filled in a color box?  Default: TRUE
#' @param names.size Size of the names text
#' @param dpi Figure dpi
#' @param label vector of labels to be used in the figure.
#' Example: c("Not Significant","Hypermethylated in group1",
#' "Hypomethylated in group1"))#'
#' @param highlight List of genes/probes to be highlighted. It should be in the names argument.
#' @param highlight.color Color of the points highlighted
#' @param show.names What names will be showed? Possibilities: "both", "significant", "highlighted"
#' @export
#' @return Saves the volcano plot in the current folder
#' @examples
#' x <- runif(200, -1, 1)
#' y <- runif(200, 0.01, 1)
#' TCGAVisualize_volcano(x,y)
#' \dontrun{
#' TCGAVisualize_volcano(
#'   x,
#'   y,
#'   filename = NULL,
#'   y.cut = 10000000,
#'   x.cut=0.8,
#'    names = rep("AAAA",length(x)),
#'    legend = "Status",
#'    names.fill = FALSE
#' )
#'
#' TCGAVisualize_volcano(
#'   x,
#'   y,
#'   filename = NULL,
#'   y.cut = 10000000,
#'   x.cut = 0.8,
#'    names = as.character(1:length(x)),
#'    legend = "Status",
#'    names.fill = TRUE, highlight = c("1","2"),
#'    show = "both"
#' )
#' TCGAVisualize_volcano(
#'   x,
#'   y,
#'   filename = NULL,
#'   y.cut = 10000000,
#'   x.cut = c(-0.3,0.8),
#'   names = as.character(1:length(x)),
#'   legend = "Status",
#'   names.fill = TRUE,
#'   highlight = c("1","2"),
#'   show = "both"
#' )
#' }
#' while (!(is.null(dev.list()["RStudioGD"]))){dev.off()}
TCGAVisualize_volcano <- function(
    x,
    y,
    filename = "volcano.pdf",
    ylab =  expression(paste(-Log[10]," (FDR corrected -P values)")),
    xlab = NULL,
    title = "Volcano plot",
    legend = NULL,
    label = NULL,
    xlim = NULL,
    ylim = NULL,
    color = c("black", "red", "green"),
    names = NULL,
    names.fill = TRUE,
    show.names = "significant",
    x.cut = 0,
    y.cut = 0.01,
    height = 5,
    width = 10,
    highlight = NULL,
    highlight.color = "orange",
    names.size = 4,
    dpi = 300)
{
    if (!is.null(names)) {
        if (all(grepl("\\|", names))) {
            names <- strsplit(names, "\\|")
            names <- unlist(lapply(names, function(x)
                x[1]))
        }
    }
    .e <- environment()
    threshold <- rep("1", length(x))
    names(color) <- as.character(1:3)

    if (is.null(label)) {
        label = c(
            "1" = "Not Significant",
            "2" = "Up regulated",
            "3" = "Down regulated"
        )
    } else  {
        names(label) <- as.character(1:3)
    }
    # get significant data
    sig <-  y < y.cut
    sig[is.na(sig)] <- FALSE

    # If x.cut
    if (length(x.cut) == 1) {
        x.cut.min <- -x.cut
        x.cut.max <- x.cut
    }
    if (length(x.cut) == 2) {
        x.cut.min <- x.cut[1]
        x.cut.max <- x.cut[2]
    }

    # hypermethylated/up regulated samples compared to old state
    up <- x  > x.cut.max
    up[is.na(up)] <- FALSE
    if (any(up & sig)) threshold[up & sig] <- "2"

    # hypomethylated/ down regulated samples compared to old state
    down <-  x < x.cut.min
    down[is.na(down)] <- FALSE
    if (any(down & sig)) threshold[down & sig] <- "3"

    if (!is.null(highlight)) {
        idx <- which(names %in% highlight)
        if (length(idx) > 0) {
            threshold[which(names %in% highlight)]  <- "4"
            color <- c(color, highlight.color)
            names(color) <- as.character(1:4)
        }
    }

    df <- data.frame(
        x = x,
        y = y,
        threshold = threshold
    )

    # As last color should be the highlighthed, we need to order all the vectors
    if (!is.null(highlight)) {
        order.idx <-  order(df$threshold)
        down <- down[order.idx]
        sig <- sig[order.idx]
        up <- up[order.idx]
        df <- df[order.idx,]
        names <- names[order.idx]
    }

    # Plot a volcano plot
    p <- ggplot(
        data = df,
        aes(
            x = x ,
            y = -1 * log10(y),
            colour = threshold
        ),
        environment = .e
    ) +
        geom_point() +
        ggtitle(title) + ylab(ylab) + xlab(xlab) +
        geom_vline(aes(xintercept = x.cut.min),
                   colour = "black",
                   linetype = "dashed") +
        geom_vline(aes(xintercept = x.cut.max),
                   colour = "black",
                   linetype = "dashed") +
        geom_hline(aes(yintercept = -1 * log10(y.cut)),
                   colour = "black",
                   linetype = "dashed") +
        scale_color_manual(
            breaks = as.numeric(names(label)),
            values = color,
            labels = label,
            name = legend
        ) +
        theme_bw() + theme(
            panel.border = element_blank(),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.text = element_text(size = 10),
            plot.title = element_text(hjust = 0.5),
            axis.line.x = element_line(colour = "black"),
            axis.line.y = element_line(colour = "black"),
            legend.position = "top",
            legend.key = element_rect(colour = 'white')
        )

    # Label points with the textxy function from the calibrate plot
    if (!is.null(names)) {
        # With the names the user can highlight the significant genes, up and down
        # or the ones highlighted
        if (show.names == "significant") {
            idx <- (up & sig) | (down & sig)
            important <- c("2", "3")
        } else if (show.names == "highlighted") {
            if (!is.null(highlight)) {
                idx <- (names %in% highlight)
                important <- c("4")
            } else {
                message("Missing highlight argument")
                return(NULL)
            }
        } else if (show.names == "both") {
            if (!is.null(highlight)) {
                idx <- (up & sig) | (down & sig) |  (names %in% highlight)
                important <- c("2", "3", "4")
            } else {
                message("Missing highlight argument")
                return(NULL)
            }
        } else {
            message("Wrong highlight argument")
            return(NULL)
        }

        if (any(threshold %in% important)) {
            check_package("ggrepel")
            if (names.fill) {
                p <- p + ggrepel::geom_label_repel(
                    data = subset(df, threshold %in% important),
                    aes(label = names[idx], fill = threshold),
                    size = names.size,
                    show.legend = FALSE,
                    fontface = 'bold',
                    color = 'white',
                    box.padding = unit(0.35, "lines"),
                    segment.colour = "grey",
                    point.padding = unit(0.3, "lines")
                ) +   scale_fill_manual(values = color[as.numeric(important)])
            }  else {
                p <- p + ggrepel::geom_text_repel(
                    data = subset(df, threshold %in% important),
                    aes(label = names[idx]),
                    size = names.size,
                    show.legend = FALSE,
                    segment.colour = "grey",
                    fontface = 'bold',
                    color = 'black',
                    point.padding = unit(0.3, "lines"),
                    box.padding = unit(0.5, 'lines')
                )
            }
        }
    }

    if (!is.null(filename)) {
        ggsave(
            p,
            filename = filename,
            width = width,
            height = height,
            dpi = dpi
        )
    } else {
        return(p)
    }
}


#' @title Differentially methylated regions Analysis
#' @description
#'   This function will search for differentially methylated CpG sites,
#'   which are regarded as possible functional regions involved
#'   in gene transcriptional regulation.
#'
#'   In order to find these regions we use the beta-values (methylation values
#'   ranging from 0.0 to 1.0) to compare two groups.
#'
#'   Firstly, it calculates the difference between the mean methylation of each
#'   group for each probes. Secondly, it calculates the p-value using the
#'   wilcoxon test using the Benjamini-Hochberg adjustment method.
#'   The default parameters will require a minimum absolute beta values delta
#'   of 0.2 and a false discovery rate (FDR)-adjusted Wilcoxon rank-sum P-value
#'   of < 0.01 for the difference.
#'
#'   After these analysis, we save a volcano plot (x-axis:diff mean methylation,
#'   y-axis: significance) that will help the user identify the differentially
#'   methylated CpG sites and return the object with the calculus in the rowRanges.
#'
#'   If the calculus already exists in the object it will not recalculated.
#'   You should set overwrite parameter to TRUE to force it, or remove the
#'   columns with the results from the object.
#'
#' @param data  SummarizedExperiment obtained from the TCGAPrepare
#' @param groupCol  Columns with the groups inside the SummarizedExperiment
#'  object. (This will be obtained by the function colData(data))
#' @param group1 In case our object has more than 2 groups, you should set
#' the name of the group
#' @param group2 In case our object has more than 2 groups, you should set
#' the name of the group
#' @param plot.filename Filename. Default: volcano.pdf, volcano.svg, volcano.png. If set to FALSE, there will be no plot.
#' @param legend Legend title
#' @param color vector of colors to be used in graph
#' @param title main title. If not specified it will be
#' "Volcano plot (group1 vs group2)
#' @param ylab y axis text
#' @param xlab x axis text
#' @param xlim x limits to cut image
#' @param ylim y limits to cut image
#' @param label vector of labels to be used in the figure.
#' Example: c("Not Significant","Hypermethylated in group1",
#' "Hypomethylated in group1"))
#' @param p.cut p values threshold. Default: 0.01
#' @param probe.names is probe.names
#' @param diffmean.cut diffmean threshold. Default: 0.2
#' @param adj.method Adjusted method for the p-value calculation
#' @param paired Wilcoxon paired parameter. Default: FALSE
#' @param alternative wilcoxon test alternative
#' @param save Save object with results? Default: TRUE
#' @param save.directory Directory to save the files. Default: working directory
#' @param filename Name of the file to save the object.
#' @param cores Number of cores to be used in the non-parametric test
#' Default = groupCol.group1.group2.rda
#' @import ggplot2
#' @importFrom SummarizedExperiment colData rowRanges assay rowRanges<- values<- SummarizedExperiment metadata<-
#' @importFrom S4Vectors metadata
#' @importFrom dplyr data_frame
#' @importFrom methods as
#' @import readr
#' @import utils
#' @export
#' @return Volcano plot saved and the given data with the results
#' (diffmean.group1.group2,p.value.group1.group2,
#' p.value.adj.group1.group2,status.group1.group2)
#' in the rowRanges where group1 and group2 are the names of the groups
#' @examples
#' nrows <- 200; ncols <- 20
#'  counts <- matrix(
#'    runif(nrows * ncols, 1, 1e4), nrows,
#'    dimnames = list(paste0("cg",1:200),paste0("S",1:20))
#' )
#' rowRanges <- GenomicRanges::GRanges(
#'   rep(c("chr1", "chr2"), c(50, 150)),
#'   IRanges::IRanges(floor(runif(200, 1e5, 1e6)), width = 100),
#'   strand = sample(c("+", "-"), 200, TRUE),
#'   feature_id = sprintf("ID%03d", 1:200)
#' )
#' colData <- S4Vectors::DataFrame(
#'   Treatment = rep(c("ChIP", "Input"), 5),
#'   row.names = LETTERS[1:20],
#'   group = rep(c("group1","group2"),c(10,10))
#' )
#'data <- SummarizedExperiment::SummarizedExperiment(
#'          assays=S4Vectors::SimpleList(counts=counts),
#'          rowRanges = rowRanges,
#'          colData = colData
#' )
#' SummarizedExperiment::colData(data)$group <- c(rep("group 1",ncol(data)/2),
#'                          rep("group 2",ncol(data)/2))
#' hypo.hyper <- TCGAanalyze_DMC(data, p.cut = 0.85,"group","group 1","group 2")
#' SummarizedExperiment::colData(data)$group2 <- c(rep("group_1",ncol(data)/2),
#'                          rep("group_2",ncol(data)/2))
#' hypo.hyper <- TCGAanalyze_DMC(
#'   data = data,
#'   p.cut = 0.85,
#'   groupCol = "group2",
#'   group1 = "group_1",
#'   group2 = "group_2"
#' )
TCGAanalyze_DMC <- function(
    data,
    groupCol = NULL,
    group1 = NULL,
    group2 = NULL,
    alternative = "two.sided",
    diffmean.cut = 0.2,
    paired = FALSE,
    adj.method = "BH",
    plot.filename = "methylation_volcano.pdf",
    ylab =  expression(paste(-Log[10], " (FDR corrected -P values)")),
    xlab =  expression(paste("DNA Methylation difference (", beta, "-values)")),
    title = NULL,
    legend = "Legend",
    color = c("black",  "red", "darkgreen"),
    label = NULL,
    xlim = NULL,
    ylim = NULL,
    p.cut = 0.01,
    probe.names = FALSE,
    cores = 1,
    save = TRUE,
    save.directory = ".",
    filename = NULL)
{
    names(color) <- as.character(1:3)
    # Check if object is a summarized Experiment
    if (!is(data, "RangedSummarizedExperiment")) {
        stop(
            paste0(
                "Sorry, but I'm expecting a Summarized Experiment object, but I got a: ",
                class(data)
            )
        )
    }

    # Check if object has NAs for all samples
    if (any(rowSums(!is.na(assay(data))) == 0)) {
        stop(
            paste0(
                "Sorry, but we found some probes with NA ",
                "for all samples in your data, please either ",
                "remove/or replace them"
            )
        )
    }

    if (is.null(groupCol)) {
        message("Please, set the groupCol parameter")
        return(NULL)
    }

    if (!(groupCol %in% colnames(colData(data)))) {
        stop(paste0("column ", groupCol, " not found in the object"))
    }

    if (length(unique(colData(data)[, groupCol])) != 2 &&
        is.null(group1) && is.null(group2)) {
        message("Please, set the group1 and group2 parameters")
        return(NULL)
    } else if (length(unique(colData(data)[, groupCol])) == 2  &&
               (is.null(group1) || is.null(group2)))
    {
        group1 <- unique(colData(data)[, groupCol])[1]
        group2 <- unique(colData(data)[, groupCol])[2]
    } else {
        message(paste0("Group1:", group1))
        message(paste0("Group2:", group2))
    }

    # Check if groups has at least one sample
    if (!any(colData(data)[, groupCol] == group1, na.rm = TRUE)) {
        stop(paste0("Sorry, but ", group1, " has no samples"))
    }
    if (!any(colData(data)[, groupCol] == group2, na.rm = TRUE)) {
        stop(paste0("Sorry, but ", group2, " has no samples"))
    }

    results <- dmc.non.parametric.se(
        data = data,
        groupCol = groupCol,
        group1 = group1,
        group2 = group2,
        paired = paired,
        adj.method = adj.method,
        alternative = alternative,
        cores = cores
    )

    # defining title and label if not specified by the user
    if (is.null(title)) {
        title <- paste("Volcano plot", "(", group1, "vs", group2, ")")
    }

    if (is.null(label)) {
        label <- c(
            "Not Significant",
            "Hypermethylated",
            "Hypomethylated"
        )
        label[2:3] <-  paste(label[2:3], "in", group1)
    }

    group1.col <- gsub("[[:punct:]]| ", ".", group1)
    group2.col <- gsub("[[:punct:]]| ", ".", group2)

    # get significant data
    results$status <- "Not Significant"
    sig <- which(results[, grep("p.value.adj", colnames(results))] < p.cut)
    hyper <- which(results[, grep("minus", colnames(results))] > diffmean.cut)
    hypo <- which(results[, grep("minus", colnames(results))] < -diffmean.cut)
    hyper.sig <- intersect(hyper, sig)
    hypo.sig <- intersect(hypo, sig)

    if (length(hyper.sig))
        results[hyper.sig, "status"] <- paste0("Hypermethylated in ", group1)

    if (length(hypo.sig))
        results[hypo.sig, "status"] <-  paste0("Hypomethylated in ", group1)

    # Plot a volcano plot
    names <- NULL
    if (probe.names)
        names <- rownames(results)

    if (!is.null(plot.filename)) {
        message("Saving volcano plot as: ", plot.filename)
        TCGAVisualize_volcano(
            x = results[, grep("minus", colnames(results))],
            y = results[, grep("p.value.adj", colnames(results))],
            filename = plot.filename,
            ylab =  ylab,
            xlab = xlab,
            title = title,
            legend = legend,
            label = label,
            names = names,
            x.cut = diffmean.cut,
            y.cut = p.cut
        )
    }

    if (save) {
        # saving results into a csv file
        csv <- paste0(
            paste(
                "DMR_results",
                gsub("_", ".", groupCol),
                group1.col,
                group2.col,
                "pcut",
                p.cut,
                "meancut",
                diffmean.cut,
                sep = "_"
            ),
            ".csv"
        )
        dir.create(save.directory,
                   showWarnings = FALSE,
                   recursive = TRUE)
        csv <- file.path(save.directory, csv)
        message(paste0("Saving the results also in a csv file: "), csv)
        write.csv(results, csv)
    }
    return(results)
}

#' @title Create starburst plot
#'
#' @description
#'   Create Starburst plot for comparison of DNA methylation and gene expression.
#'    The log10 (FDR-corrected P value) is plotted for beta value for DNA
#'    methylation (x axis) and gene expression (y axis) for each gene.
#'
#'    The black dashed line shows the FDR-adjusted P value of 0.01.
#'
#'    You can set names to TRUE to get the names of the significant genes.
#'
#'    Candidate biologically significant genes will be circled in the plot.
#'
#'    Candidate biologically significant are the genes that respect the
#'    expression (logFC.cut), DNA methylation (diffmean.cut) and
#'    significance thresholds (exp.p.cut, met.p.cut)
#'
#' @details
#'    Input: data with gene expression/methylation expression
#'    Output: starburst plot
#'
#' @param met A SummarizedExperiment with methylation data obtained from the
#' TCGAPrepare or Data frame from DMR_results file. Expected colData columns: diffmean,  p.value.adj  and p.value
#' Execute volcanoPlot function in order to obtain these values for the object.
#' @param exp Object obtained by DEArnaSEQ function
#' @param filename The filename of the file (it can be pdf, svg, png, etc)
#' @param met.platform DNA methylation platform ("27K","450K" or "EPIC")
#' @param genome Genome of reference ("hg38" or "hg19") used to identify nearest probes TSS
#' @param legend legend title
#' @param color vector of colors to be used in graph
#' @param label vector of labels to be used in graph
#' @param title main title
#' @param names Add the names of the significant genes? Default: FALSE
#' @param names.fill Names should be filled in a color box?  Default: TRUE
#' @param ylab y axis text
#' @param xlab x axis text
#' @param xlim x limits to cut image
#' @param ylim y limits to cut image
#' @param met.p.cut methylation p value cut-off
#' @param exp.p.cut expression p value cut-off
#' @param diffmean.cut If set, the probes with diffmean higher
#' than methylation cut-off will be
#'  highlighted in the plot. And the data frame return will be subseted.
#' @param logFC.cut If set, the probes with expression fold
#' change higher than methylation cut-off will be
#'  highlighted in the plot. And the data frame return will be subseted.
#' @param group1 The name of the group 1
#' Obs: Column p.value.adj.group1.group2 should exist
#' @param group2 The name of the group 2.
#' Obs: Column p.value.adj.group1.group2 should exist
#' @param return.plot If true only plot object will be returned (pdf will not be created)
#' @param height Figure height
#' @param width Figure width
#' @param dpi Figure dpi
#' @import ggplot2
#' @importFrom GenomicRanges distanceToNearest
#' @importFrom SummarizedExperiment rowRanges rowRanges<- values<-
#' @importFrom S4Vectors subjectHits queryHits
#' @export
#' @return Save a starburst plot
#' @examples
#' \dontrun{
#' library(SummarizedExperiment)
#' met <- TCGAbiolinks:::getMetPlatInfo(genome = "hg38", platform = "27K")
#' values(met) <- NULL
#' met$probeID <- names(met)
#' nrows <- length(met); ncols <- 20
#' counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
#' colData <- S4Vectors::DataFrame(
#'   Treatment = rep(c("ChIP", "Input"), 5),
#'   row.names = LETTERS[1:20],
#'   group = rep(c("group1","group2"),c(10,10))
#' )
#' met <- SummarizedExperiment::SummarizedExperiment(
#'          assays = S4Vectors::SimpleList(counts=counts),
#'          rowRanges = met,
#'          colData = colData)
#' rowRanges(met)$diffmean.g1.g2 <- c(runif(nrows, -0.1, 0.1))
#' rowRanges(met)$diffmean.g2.g1 <- -1*(rowRanges(met)$diffmean.g1.g2)
#' rowRanges(met)$p.value.g1.g2 <- c(runif(nrows, 0, 1))
#' rowRanges(met)$p.value.adj.g1.g2 <- c(runif(nrows, 0, 1))
#' exp <- TCGAbiolinks:::get.GRCh.bioMart("hg38")
#' exp$logFC <- runif(nrow(exp), -5, 5)
#' exp$FDR <- runif(nrow(exp), 0.01, 1)
#'
#' result <- TCGAvisualize_starburst(
#'   met,
#'   exp,
#'   exp.p.cut = 0.05,
#'   met.p.cut = 0.05,
#'   logFC.cut = 2,
#'   group1 = "g1",
#'   group2 = "g2",
#'   genome = "hg38",
#'   met.platform = "27K",
#'   diffmean.cut = 0.0,
#'   names  = TRUE
#' )
#' }
TCGAvisualize_starburst <- function(
    met,
    exp,
    group1 = NULL,
    group2 = NULL,
    exp.p.cut = 0.01,
    met.p.cut = 0.01,
    diffmean.cut = 0,
    logFC.cut = 0,
    met.platform,
    genome,
    names = FALSE,
    names.fill = TRUE,
    filename = "starburst.pdf",
    return.plot = FALSE,
    ylab = expression(atop("Gene Expression",
                           paste(Log[10],
                                 " (FDR corrected P values)"))),
    xlab = expression(atop("DNA Methylation",
                           paste(Log[10],
                                 " (FDR corrected P values)"))),
    title = "Starburst Plot",
    legend = "DNA Methylation/Expression Relation",
    color = NULL,
    label = c(
        "Not Significant",
        "Up regulated & Hypo methylated",
        "Down regulated & Hypo methylated",
        "hypo methylated",
        "hyper methylated",
        "Up regulated",
        "Down regulated",
        "Up regulated & Hyper methylated",
        "Down regulated & Hyper methylated"
    ),
    xlim = NULL,
    ylim = NULL,
    height = 10,
    width = 20,
    dpi = 600
) {
    if (missing(genome))
        stop("Please set genome (hg19 or hg38)")
    if (missing(met.platform))
        stop("Please set met.platform (EPIC, 450K or 27K)")

    .e <- environment()

    group1.col <- gsub("[[:punct:]]| ", ".", group1)
    group2.col <- gsub("[[:punct:]]| ", ".", group2)

    if (title == "Starburst Plot") {
        if (diffmean.cut != 0 & logFC.cut == 0) {
            title <- bquote(atop("Starburst Plot", scriptstyle((
                list(
                    Delta ~ bar(beta) >= .(diffmean.cut),
                    FDR[expression]  <=  .(exp.p.cut),
                    FDR[DNAmethylation]  <=  .(met.p.cut)
                )
            ))))
        } else if (logFC.cut != 0 & diffmean.cut == 0) {
            title <- bquote(atop("Starburst Plot", scriptstyle((
                list(
                    group("|", logFC, "|")  >=  .(logFC.cut),
                    FDR[expression]  <=  .(exp.p.cut),
                    FDR[DNAmethylation]  <=  .(met.p.cut)
                )
            ))))
        } else if (logFC.cut != 0 & diffmean.cut != 0) {
            title <- bquote(atop("Starburst Plot", scriptstyle((
                list(
                    Delta,
                    bar(beta)  >=  .(diffmean.cut),
                    group("|", logFC, "|")  >=  .(logFC.cut),
                    FDR[expression]  <=  .(exp.p.cut),
                    FDR[DNAmethylation]  <=  .(met.p.cut)
                )
            ))))
        }
    }

    if (is.null(color))
        color <- c(
            "#000000",
            "#E69F00",
            "#56B4E9",
            "#009E73",
            "red",
            "#0072B2",
            "#D55E00",
            "#CC79A7",
            "purple"
        )
    names(color) <- as.character(1:9)
    names(label) <- as.character(1:9)
    names.color <- color
    names(names.color) <- label

    if (is.null(group1) || is.null(group2)) {
        message("Please, set the group1 and group2 parameters")
        return(NULL)
    }

    if (class(met) == class(as(SummarizedExperiment(), "RangedSummarizedExperiment"))) {
        met <- SummarizedExperiment::values(met)
    }

    # Preparing methylation
    pcol <-
        gsub("[[:punct:]]| ",
             ".",
             paste("p.value.adj", group1, group2, sep = "."))
    if (!(pcol %in%  colnames(met))) {
        pcol <-
            gsub("[[:punct:]]| ",
                 ".",
                 paste("p.value.adj", group2, group1, sep = "."))
    }
    if (!(pcol %in%  colnames(met))) {
        stop("Error! p-values adjusted not found. Please, run TCGAanalyze_DMR")
    }

    # Transform factor coluns to characters
    fctr.cols <- sapply(exp, is.factor)
    exp[, fctr.cols] <- sapply(exp[, fctr.cols], as.character)
    # Methylation matrix and expression matrix should have the same name column for merge
    idx <- grep("ENSG", exp[1, ])
    if (length(idx) > 0) {
        colnames(exp)[idx] <- "ensembl_gene_id"
    } else {
        # it is in the row names ?
        if (grepl("ENSG", rownames(exp)[1])) {
            exp$ensembl_gene_id <- rownames(exp)
        } else {
            # We will consider our rownames has the gene symbol
            gene.info <- get.GRCh.bioMart(genome)
            if (any(sapply(rownames(exp)[1:10], function(y)
                any(grepl(
                    y, gene.info[, grep("external_gene_", colnames(gene.info))]
                ))))) {
                idx <-
                    match(rownames(exp), gene.info[, grep("external_gene_", colnames(gene.info))])
                exp <- exp[!is.na(idx), ]
                idx <-
                    match(rownames(exp), gene.info[, grep("external_gene_", colnames(gene.info))])
                exp$ensembl_gene_id <-
                    gene.info[idx, "ensembl_gene_id"]
            }
        }

    }

    if (!"probeID" %in% colnames(met))
        met$probeID <- met$Composite.Element.REF


    # Check if gene symbol columns exists
    if (!"ensembl_gene_id" %in% colnames(exp)) {
        stop("Column ensembl_gene_id was not found")
    }

    # Correlate gene expression with DNA methylation levels
    # Step 1: identify nearest TSS for each probe
    # Step 2: create a table with probe, gene, transcript, distance.TSS and merge with
    #         the met (by probe name) and gene expression (by gene name) results
    message("o Fetching auxiliary information")
    message("oo Fetching probes genomic information")
    met.info <- getMetPlatInfo(genome = genome, platform = met.platform)
    values(met.info) <- NULL
    message("oo Fetching TSS information")
    tss <- getTSS(genome = genome)
    tss <- promoters(tss, upstream = 0, downstream = 0)
    message("o Mapping probes to nearest TSS")
    dist <- distanceToNearest(tss, met.info)
    g <- suppressWarnings(as.data.frame(tss[queryHits(dist)]))
    g$start_position <- NULL
    g$end_position <- NULL
    colnames(g)[1:5] <- paste0("gene_", colnames(g)[1:5])
    m <- suppressWarnings(as.data.frame(met.info[subjectHits(dist)], row.names = NULL))
    colnames(m) <- paste0("probe_", colnames(m))
    m$probeID <- names(met.info[subjectHits(dist)])
    nearest <- cbind(m, g)
    nearest$distance_TSS <- values(dist)$distance
    # Keep only one entry
    nearest$id <- paste0(nearest$probeID, nearest$ensembl_gene_id)
    nearest <- nearest[order(nearest$id, -abs(nearest$distance_TSS)),]
    nearest <- nearest[!duplicated(nearest$id),]
    nearest$id <- NULL
    nearest <- nearest[, c("distance_TSS", "probeID", "ensembl_gene_id")]
    # END mapping nearest probe to nearest gene
    message("o Mapping results information")
    volcano <- plyr::join(nearest, exp, by = "ensembl_gene_id")
    volcano <- merge(volcano, met, by = "probeID")

    volcano$ID <- paste(volcano$ensembl_gene_id, volcano$probeID, sep = ".")
    volcano <- volcano[!is.na(volcano$FDR), ] # Some genes have no values
    # Preparing gene expression
    volcano$geFDR <- log10(volcano$FDR)
    volcano$geFDR2 <- volcano$geFDR
    volcano[volcano$logFC > 0, "geFDR2"] <-
        -1 * volcano[volcano$logFC > 0, "geFDR"]

    # Preparing DNA methylation
    diffcol <- gsub(
        "[[:punct:]]| ",
        ".",
        paste("diffmean", group1, group2, sep = ".")
    )
    volcano$meFDR <- log10(volcano[, pcol])
    volcano$meFDR2 <- volcano$meFDR
    idx <- volcano[, diffcol] > 0
    idx[is.na(idx)] <- FALSE # handling NAs
    volcano[idx, "meFDR2"] <- -1 * volcano[idx, "meFDR"]

    label[2:9] <-  paste(label[2:9], "in", group2)

    # subseting by regulation (geFDR) and methylation level
    # (meFDR) down regulated up regulated lowerthr
    # |||||||||||||||| upperthr hypomethylated hypermethylated
    met.lowerthr <- log10(met.p.cut)
    met.upperthr <- (-met.lowerthr)

    exp.lowerthr <- log10(exp.p.cut)
    exp.upperthr <- (-exp.lowerthr)
    volcano <- suppressWarnings(tibble::as.tibble(volcano))

    # Group 2:up regulated and hypomethylated
    a <- dplyr::filter(
        volcano,
        volcano$geFDR2 > exp.upperthr &
            volcano$meFDR2 < met.lowerthr &
            abs(volcano[, diffcol]) > diffmean.cut &
            abs(volcano$logFC) > logFC.cut
    )

    # Group 3: down regulated and hypomethylated
    b <- dplyr::filter(
        volcano,
        volcano$geFDR2 < exp.lowerthr &
            volcano$meFDR2 < met.lowerthr &
            abs(volcano[, diffcol]) > diffmean.cut &
            volcano$logFC < logFC.cut
    )

    # Group 4: hypomethylated
    c <- dplyr::filter(
        volcano,
        volcano$geFDR2 > exp.lowerthr &
            volcano$geFDR2 < exp.upperthr &
            volcano$meFDR2 < met.lowerthr &
            abs(volcano[, diffcol]) > diffmean.cut
    )

    # Group 5: hypermethylated
    d <- dplyr::filter(
        volcano,
        volcano$geFDR2 > exp.lowerthr &
            volcano$geFDR2 < exp.upperthr &
            volcano$meFDR2 > met.upperthr &
            abs(volcano[, diffcol]) > diffmean.cut
    )

    # Group 6: upregulated
    e <- dplyr::filter(
        volcano,
        volcano$geFDR2 > exp.upperthr &
            volcano$meFDR2 < met.upperthr &
            volcano$meFDR2 > met.lowerthr &
            volcano$logFC > logFC.cut
    )

    # Group 7: downregulated
    f <- dplyr::filter(
        volcano,
        volcano$geFDR2 < exp.lowerthr &
            volcano$meFDR2 < met.upperthr &
            volcano$meFDR2 > met.lowerthr &
            abs(volcano$logFC) > logFC.cut
    )

    # Group 8: upregulated and hypermethylated
    g <- dplyr::filter(
        volcano,
        volcano$geFDR2 > exp.upperthr &
            volcano$meFDR2 > met.upperthr &
            abs(volcano[, diffcol]) > diffmean.cut &
            volcano$logFC > logFC.cut
    )

    # Group 9: downregulated and hypermethylated
    h <- dplyr::filter(
        volcano,
        volcano$geFDR2 < exp.lowerthr &
            volcano$meFDR2 > met.upperthr &
            abs(volcano[, diffcol]) > diffmean.cut &
            volcano$logFC < logFC.cut
    )

    groups <- as.character(seq(2, 9))

    # return methylation < 0, expression > 0
    volcano$starburst.status  <-  "Not Significant"
    volcano$shape <- "1"
    volcano$threshold.starburst <- "1"
    volcano$threshold.size <- "1"

    state <- c(
        "Up regulated & Hypo methylated",
        # a
        "Down regulated & Hypo methylated",
        # b
        "hypo methylated",
        # c
        "hyper methylated",
        # d
        "Up regulated",
        # e
        "Down regulated",
        # f
        "Up regulated & Hyper methylated",
        # g
        "Down regulated & Hyper methylated"
    ) # h

    s <- list(a, b, c, d, e, f, g, h)
    for (i in seq_along(s)) {
        idx <- s[[i]]$ID
        if (length(idx) > 0) {
            volcano[volcano$ID %in% idx, "threshold.starburst"] <- groups[i]
            volcano[volcano$ID %in% idx, "starburst.status"] <-
                state[i]
        }
    }

    s <- list(a, b, g, h)
    significant <- NULL
    for (i in seq_along(s)) {
        idx <- s[[i]]$ID
        if (length(idx) > 0) {
            significant <-  rbind(significant, volcano[volcano$ID %in% idx, ])
        }
    }
    message("o Plotting figure")
    volcano.aux <- volcano # we need an auxiliry data if dats is returned
    ## starburst plot
    p <- ggplot(
        data = volcano.aux,
        environment = .e,
        aes(
            x = volcano.aux$meFDR2,
            y = volcano.aux$geFDR2,
            colour = volcano.aux$threshold.starburst
        )
    ) + geom_point()

    if (names == TRUE & !is.null(significant)) {
        check_package("ggrepel")
        message("oo Adding names to genes")
        if (names.fill) {
            p <- p + ggrepel::geom_label_repel(
                data = significant,
                aes(
                    x = significant$meFDR2,
                    y =  significant$geFDR2,
                    label = map.ensg(genome, significant$ensembl_gene_id)$external_gene_name,
                    fill = as.factor(significant$starburst.status)
                ),
                size = 4,
                show.legend = FALSE,
                fontface = 'bold',
                color = 'black',
                box.padding = unit(0.35, "lines"),
                point.padding = unit(0.3, "lines")
            ) + scale_fill_manual(values = names.color)
        }  else {
            p <- p + ggrepel::geom_text_repel(
                data = significant,
                aes(
                    x = significant$meFDR2,
                    y =  significant$geFDR2,
                    label = map.ensg(genome, significant$ensembl_gene_id)$external_gene_name,
                    fill = significant$starburst.status
                ),
                size = 4,
                show.legend = FALSE,
                fontface = 'bold',
                color = 'black',
                point.padding = unit(0.3, "lines"),
                box.padding = unit(0.5, 'lines')
            )
        }
    }


    if (!is.null(xlim)) {
        p <- p + xlim(xlim)
    }
    if (!is.null(ylim)) {
        p <- p + ylim(ylim)
    }
    p <- p + ggtitle(title) + ylab(ylab) + xlab(xlab) + guides(size = FALSE)
    p <- p + scale_color_manual(values = color,
                                labels = label,
                                name = legend) +
        guides(col = guide_legend(nrow = 3))

    p <-
        p + geom_hline(aes(yintercept = exp.lowerthr),
                       colour = "black",
                       linetype = "dashed") +
        geom_hline(aes(yintercept = exp.upperthr),
                   colour = "black",
                   linetype = "dashed") +
        geom_vline(aes(xintercept = met.lowerthr),
                   colour = "black",
                   linetype = "dashed") +
        geom_vline(aes(xintercept = met.upperthr),
                   colour = "black",
                   linetype = "dashed")  +
        theme_bw() +
        theme(
            panel.border = element_blank(),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.line.x = element_line(colour = "black"),
            axis.line.y = element_line(colour = "black"),
            legend.position = "top",
            legend.key = element_rect(colour = 'white'),
            plot.title = element_text(
                face = "bold",
                size = 16,
                hjust = 0.5
            ),
            legend.text = element_text(size = 14),
            legend.title = element_text(size = 14),
            axis.text = element_text(size = 14),
            axis.title.x = element_text(face = "bold", size = 14),
            axis.text.x = element_text(vjust = 0.5,
                                       size = 14),
            axis.title.y = element_text(face = "bold",
                                        size = 14),
            axis.text.y = element_text(size = 14)
        )
    if (!return.plot)
        ggsave(
            filename = filename,
            width = width,
            height = height,
            dpi = dpi
        )
    volcano$shape <- NULL
    volcano$threshold.starburst <- NULL
    volcano$threshold.size <- NULL

    volcano <- dplyr::filter(
        volcano,
        volcano$geFDR <= exp.lowerthr &
            volcano$meFDR <= met.lowerthr
    )

    if (diffmean.cut != 0) {
        volcano <-
            dplyr::filter(volcano, abs(volcano[, diffcol]) > diffmean.cut)
    }
    if (logFC.cut != 0) {
        volcano <- dplyr::filter(volcano, abs(volcano$logFC) >= logFC.cut)
    }

    message("o Saving results")
    message("oo Saving significant results as: starburst_results.csv")
    message(
        "oo It contains pair with changes both in the expression level of the nearest gene and  in the DNA methylation level"
    )
    suppressWarnings(write_csv(x = volcano, path = "starburst_results.csv"))

    if (return.plot) {
        return(list(plot = p, starburst = volcano))
    }

    return(volcano)
}


#' @title Get DNA methylation array metadata from SesameData
#' @noRd
getMetPlatInfo <- function(
    genome = c("hg38","hg19"),
    platform = c("450k","EPIC","27k")
){
    genome <- match.arg(genome)
    # arrayType <- match.arg(arrayType)

    platform <-  switch(
        platform,
        "450K" = "HM450",
        "450k" = "HM450",
        "27k"  = "HM27",
        "27K"  = "HM27",
        "EPIC" = "EPIC"
    )
    if (is.null(platform)) {
        stop("platform must one of the following options: 450k, EPIC or 27k")
    }

    check_package("sesameData")
    check_package("sesame")

    sesameDataCacheAll()
    sesameDataGet(
        str_c(
            platform,
            ".",
            genome,
            ".manifest"
        )
    )
}

Try the TCGAbiolinks package in your browser

Any scripts or data that you put into this service are public.

TCGAbiolinks documentation built on Nov. 8, 2020, 5:37 p.m.