R/s3_DE_analysis_function.r

Defines functions vizualize_DE_genes_bp vizualize_DE_genes_HM convert2HGNC s3_DE_analysis

Documented in s3_DE_analysis

#' s3 differential gene analysis
#'
#' This function runs differential gene analysis (linear modeling on normalized counts)
#' @param targetfile target file.
#' @param folder2voomobjects folder where necessary voom R objects are stored (should be results folder from step 1) defualt is s1_norm_raw_counts_results
#' @param results_folder User can specify name of output folder they want results to go in (default is s3_DE_results/)
#' @param gene_conversion_file file with alternative gene names (we usually convert Rhesus Ensembl genes to HGNC) 
#' @param blocking_column column to account sampling from the same animal multiple times (needs to be the same as whatever was specified in step s1_normalize_raw_counts)
#' @param matrixfile text file outlining different contrasts to do in DE analysis 
#' @param pvalue parameter for determining significantly DE genes (default 0.05)
#' @param logfoldchange parameter for determining significantly DE genes (default 1.5)
#' @param reset_design_matrix parameter for user to reset design matrix if they want to do this differently than they did in step 1 (defualt is FALSE)
#' @param target_columns vector of columns from target file to use as treatment group, this needs to be set if reset_design_matrix is TRUE
#' @param batch_column column from target file to be used as batch effect if reseting design matrix
#' @keywords differential expression linear modeling
#' @export
#' @import edgeR
#' @import limma
#' @import gplots
#' @import ggplot2
#' @import data.table
#' @import stringr
#' @examples
#' s3_DE_analysis(countfile="./s1_norm_raw_counts_results/1.norm_matrix.txt", targetfile="./p1_modified_count_matrix_results/target_file.csv", gene_conversion_file="rhesus2human.csv", blocking_column=2, matrixfile="./MATRIX.txt")
 

s3_DE_analysis <- function(targetfile="./p1_modified_count_matrix_results/target_file.csv",
                           folder2voomobjects="s1_norm_raw_counts_results",
                           results_folder=FALSE,  gene_conversion_file=FALSE,
                           blocking_column=FALSE, matrixfile=FALSE, pvalue=0.05,
                           logfoldchange=1.5, reset_design_matrix=FALSE,
                           target_columns=FALSE, batch_column=FALSE) {
    if (typeof(results_folder) == "logical") {
        results_path <- generate_folder("s3_DE_results")
        unlink("./s3_DE_results/*")
    } else {
        results_path <- generate_folder(results_folder)
        unlink(paste0(results_folder, "/*"))
    }

    files <- loadfiles(count_file = FALSE, target_file = targetfile)

    if (file.exists(paste0(folder2voomobjects, "/1.voomobject.rds"))) {
        eset_voom <- readRDS(paste0(folder2voomobjects, "/1.voomobject.rds"))
    } else {
        print("WARNING: Could not find voom file...make sure you are in correct working directory, correctly pointing to where voom objects are stored or repeat step (s1_normalize_raw_counts)")
    }

    if (isTRUE(reset_design_matrix)) {
        eset_voom$design <- ""
        if (typeof(batch_column) == "logical") {
            counter <- 0
            treats <- list()
            if (length(target_columns) > 1) {
                for (target in target_columns) {
                    counter <- counter + 1
                    tr <- factor(files$targets[, target],
                        levels = unique(files$targets[, target]))
                    treats[[paste0("tr", counter)]] <- tr
                }
                if (length(target_columns) == 2) {
                    tr1 <- treats$tr1
                    tr2 <- treats$tr2
                    design <- model.matrix(~ 0 + tr1:tr2)
                } else if (length(target_columns) == 3) {
                    tr1 <- treats$tr1
                    tr2 <- treats$tr2
                    tr3 <- treats$tr3
                    design <- model.matrix(~ 0 + tr1:tr2:tr3)
                } else {
                    print("WARNING: not set up to work with more than 3 treatments")
                }
                rownames(design) <- colnames(eset_voom$E)
                colnames(design) <- make.names(colnames(design))
                design <- design[, colnames(design)[order(tolower(colnames(design[, ])))]]
                design <- design[, colSums(design) > 0]

            } else {
                tr <- factor(files$targets[, target_column],
                    levels = unique(files$targets[, target_column])
                )
                design <- model.matrix(~ 0 + tr)
                rownames(design) <- colnames(eset_voom$E)
                colnames(design) <- make.names(colnames(design))
            }
        } else {
            counter <- 0
            treats <- list()
            if (length(target_columns) > 1) {
                for (target in target_columns) {
                    counter <- counter + 1
                    tr <- factor(files$targets[, target],
                        levels = unique(files$targets[, target]))
                    treats[[paste0("tr", counter)]] <- tr
                }
                Xid <- factor(files$targets[, batch_column],
                    levels = unique(files$targets[, batch_column])
                )

                if (length(target_columns) == 2) {
                    tr1 <- treats$tr1
                    tr2 <- treats$tr2
                    design <- model.matrix(~ 0 + tr1:tr2 + batch)
                } else if (length(target_columns) == 3) {
                    tr1 <- treats$tr1
                    tr2 <- treats$tr2
                    tr3 <- treats$tr3
                    design <- model.matrix(~ 0 + tr1:tr2:tr3 + Xid)
                } else {
                    print ("WARNING: not set up to work with more than 3 treatments")
                }

                rownames(design) <- colnames(eset_voom$E)
                colnames(design) <- make.names(colnames(design))
                design <- design[, colnames(design)[order(tolower(colnames(design[, ])))]]
                design <- design[, colSums(design) > 0]
                excludeAll <- nonEstimable(design)
                if ("tr1" %in% excludeAll) {
                    return("interactions term non estimable")
                }
                design <- design[, !colnames(design) %in% excludeAll]
                if (!is.fullrank(design)) {
                    return("not full rank")
                }

            } else {

                tr <- factor(files$targets[, target_column],
                    levels = unique(files$targets[, target_column])
                )
                Xid <- factor(files$targets[, batch_column],
                    levels = unique(files$targets[, batch_column])
                )
                design <- model.matrix(~ 0 + tr + Xid)

                rownames(design) <- colnames(eset_voom$E)
                colnames(design) <- make.names(colnames(design))
                design <- design[, colnames(design)[order(tolower(colnames(design[, ])))]]
                design <- design[, colSums(design) > 0]
                excludeAll <- nonEstimable(design)
                if ("tr" %in% excludeAll) {
                    return("interactions term non estimable")
                }
                design <- design[, !colnames(design) %in% excludeAll]
                if (!is.fullrank(design)) {
                    return("not full rank")
                }

            }
        }
    } else {
        if (file.exists(paste0(folder2voomobjects, "/1.designobject.rds"))) {
            design <- readRDS(paste0(folder2voomobjects, "/1.designobject.rds"))
        } else {
            print ("WARNING: Could not find design file...make sure you are in correct working directory, correctly pointing to where voom objects are stored or repeat step (s1_normalize_raw_counts)")
        }
    }

    if (file.exists(paste0(folder2voomobjects, "/1.corfit.rds"))) {
        corfit <- readRDS(paste0(folder2voomobjects, "/1.corfit.rds"))
    } else {
        print ("WARNING: Could not find corfit file...not necessary for analysis but just warning for the user")
    }

    if (exists("design") == FALSE | exists("eset_voom") == FALSE) {
        print ("WARNING: could not find design and voom model... make sure user is in the correct working directory")
    } else {
        if (typeof(blocking_column) != "logical") {
            BLOCKID  <- factor(files$targets[, blocking_column],
                               levels = unique(files$targets[, blocking_column]))

            if (exists("corfit") == TRUE) {
                V.lmfit <- lmFit(eset_voom, design = design,
                             block = BLOCKID,
                             correlation = corfit$consensus)
            } else {
                V.lmfit <- lmFit(eset_voom, design = design, block = BLOCKID)
            }
        } else {
            if (exists("corfit") == TRUE) {
                V.lmfit <- lmFit(eset_voom, design = design, correlation = corfit$consensus)
            } else {
                V.lmfit <- lmFit(eset_voom, design = design)
            }
        }
        if (typeof(matrixfile) == "character") {
            print("STATUS: getting DE genes...")

            matrix_contrast <- scan(matrixfile,  character(), quote = "")
            contmatrix <- makeContrasts(contrasts = matrix_contrast,
                                         levels = design)

            fit <- contrasts.fit(V.lmfit, contrasts = as.matrix(contmatrix))
            ##############EBAYES PARAMETERS###########
            # robust: should the estimation of df prior and var prior be robustified for 
            ## outlier variance (defualt is FALSE, we set to TRUE)
            # trend: should an intensity be allowed for prior variance (defualt is that prior variance is consistent FALSE)
            # portion: portion of genes which are differentially expressed (defualt is 0.01)
            # stdev.coef.lim: lower and upper limits for standard deviation of LFC for differentially expressed genes 
            ## defualt is c(0.1, 4)
            # winsor.tail.p: gives the left and right tail proportions of x to winsor (only used when robust=TRUE)
            ## (defualt is c(0.5, 0.1))
            ##Winsorizing - the transformation of statistic by limiting extreme values in the statistical data to reduce 
            ## the effect of outliers. Set outliers to a specifc percentile - a 90% winsoriation would see all the data below the 5tj
            ## percentile set to the 5th percentile and data above the 95th percentile set to the 95th percentile
            fit <- eBayes(fit, robust = TRUE, trend = TRUE)
            results <- decideTests(fit, lfc = round(log2(logfoldchange), digits = 2),
                                   method = "separate",
                                   adjust.method = "BH", p.value = pvalue)
            ###############################################################

            #########################OUTPUT FILES##########################

            # OUTPUT ALL fit DATA
            write.fit(fit, file = file.path(results_path, "3.All_data.csv"),
                      digits = 3, method = "separate", adjust = "BH", sep = " ,")

            #OUTPUT ALL LFCS
            write.csv(fit$coefficients,
                        file = file.path(results_path, "3.All_LFC.csv"))

            #OUTPUT ALL LFCS AFTER TRANSLATING GENE NAMES
            convert2HGNC(
                gene_conversion_file,
                "3.All_LFC.csv",
                "3.All_LFC_HGNC.csv",
                results_path
            )

            # OUTPUT ALL PVALS
            write.csv(fit$p.value, file = file.path(results_path,
                                                      "3.All_Pvalues.csv"))

            # OUTPUT ALL PVALS AFTER TRANSLATING GENE NAMES
            convert2HGNC(
                gene_conversion_file,
                "3.All_Pvalues.csv",
                "3.All_Pvalues_HGNC.csv",
                results_path
            )
            # OUTPUT ALL ADJUSTED PVALS
            for (i in 1:ncol(fit$p.value)) {
                 fit$p.value[, i] <- p.adjust(fit$p.value[, i], method = "BH")
            }
            write.csv(fit$p.value, file = file.path(results_path, "3.All_Pvalues_adj.csv"), quote = F)


            # OUTPUT ALL PVALS AFTER TRANSLATING GENE NAMES
            convert2HGNC(
                gene_conversion_file,
                "3.All_Pvalues_adj.csv",
                "3.All_Pvalues_adj_HGNC.csv",
                results_path
            )
 
            # OUTPUT ALL t VALUES
            write.csv(fit$t, file = file.path(results_path, "3.All_tvalues.csv"))

            # OUTPUT ALL t VALUES AFTER GENE TRANSLATION
            convert2HGNC(
                gene_conversion_file,
                "3.All_tvalues.csv",
                "3.All_tvalues_HGNC.csv",
                results_path
            )
            ####################################################################
            if (typeof(gene_conversion_file) == "character") {

                rhesus2human <- read.csv(file = gene_conversion_file, header = TRUE,
                                        stringsAsFactors = FALSE)
            }
            ##################Pull out signifcantly expressed genes#############
            ##SIGNIFICANT LOGVALUES
            #Extract results of differential expression
            #LogFold change is the coefficients
            dataMatrix <- fit$coefficients
            sigMask <- dataMatrix * (results**2) # 1 if significant, 0 otherwise

            # filter for significant genes
            ExpressMatrixLFC <- subset(dataMatrix,
                                       rowSums(sigMask) != 0)
            sigMask <- subset(sigMask, rowSums(sigMask) != 0)

            write.csv(ExpressMatrixLFC, file = file.path(results_path,
                                        "3.Significant_separate_LFC.csv"))
            convert2HGNC(gene_conversion_file,
                        "3.Significant_separate_LFC.csv",
                        "3.Significant_separate_LFC_HGNC_AV.csv",
                         results_path)
            ##SIGNIFICANT T values
            #Extract results of differential expression
            #LogFold change is the coefficients
            dataMatrix <- fit$t
            sigMask <- dataMatrix * (results**2) # 1 if significant, 0 otherwise

            #filter for significant genes
            ExpressMatrixtvalue <- subset(dataMatrix,
                                          rowSums(sigMask) != 0)

            sigMask <- subset(sigMask, rowSums(sigMask) != 0)
            write.csv(ExpressMatrixtvalue,
                      file = file.path(results_path,
                             "3.Significant_separate_tvalues.csv"))

            convert2HGNC(gene_conversion_file,
                         "3.Significant_separate_tvalues.csv",
                         "3.Significant_separate_tvalues_HGNC_AV.csv",
                         results_path)
            ##SIGNIFICANT P values
            #Extract results of differential expression
            #LogFold change is the coefficients
            dataMatrix <- fit$p.value
            sigMask <- dataMatrix * (results**2) # 1 if significant, 0 otherwise
            #filter for significant genes
            ExpressMatrixPvalue <- subset(dataMatrix, rowSums(sigMask) != 0)
            sigMask <- subset(sigMask, rowSums(sigMask) != 0)
            write.csv(ExpressMatrixPvalue,
                      file = file.path(results_path,
                             "3.Significant_separate_Pvaluesadj.csv"))
            convert2HGNC(gene_conversion_file,
                         "3.Significant_separate_Pvaluesadj.csv",
                         "3.Significant_separate_Pvaluesadj_HGNC_AV.csv",
                         results_path)

            results_path2 <- generate_folder("s3_DE_results/enrichfiles")
            unlink("./s3_DE_results/enrichfiles/*")

            for (i in colnames(fit$coefficients)) {
                #filter for significant genes
                ExpressMatrixLFC1 <- subset(fit$coefficients[, i],
                                            results[, i] != 0)
                allgenes <- fit$coefficients[, i]
                if (typeof(gene_conversion_file) == "character") {
                    if (length(ExpressMatrixLFC1) > 0) {
                        sig_HGNC <- merge(rhesus2human, ExpressMatrixLFC1,
                                          by.x = "Gene.stable.ID",
                                          by.y = "row.names",
                                          all.X = T, all.Y = T)
                        sig_HGNC <- sig_HGNC[ , !(names(sig_HGNC) %in% c("Gene.stable.ID"))]
                        dimensions <- dim(sig_HGNC)
                        if  (dimensions[1] == 0) {
                            print (paste0("WARNING: no sucessful translations from Ensembl to HGNCs so using ensembl IDs for comparison ", i))
                            write.table(ExpressMatrixLFC1,
                                        file = file.path(results_path2,
                                                         paste0(i, "_sig.rnk")),
                                        row.names = FALSE, col.names = FALSE,
                                        sep = "\t", quote = FALSE)
                            write.table(ExpressMatrixLFC1,
                                        file = file.path(results_path2,
                                                paste0(i, "_sig4GSEA.rnk")),
                                        row.names = FALSE, col.names = FALSE,
                                        sep = "\t", quote = FALSE)
                        }
                        else {
                            sig_HGNC <- avereps(sig_HGNC,
                                                ID = sig_HGNC$HGNC.symbol)
                            write.table(sig_HGNC,
                                        file = file.path(results_path2,
                                                         paste0(i, "_sig.rnk")),
                                         row.names = FALSE, col.names = FALSE,
                                         sep = "\t", quote = FALSE)
                            sig_HGNCnodup <- sig_HGNC[!duplicated(sig_HGNC[,2]), ]
                            write.table(sig_HGNCnodup,
                                        file = file.path(results_path2,
                                                paste0(i, "_sig4GSEA.rnk")),
                                        row.names = FALSE, col.names = FALSE,
                                        sep = "\t", quote = FALSE)
                        }
                    } else {
                        print (paste0("WARNING: comparison ", i, " only has 0 significantly different gene therefore not generating enrichment file"))
                    }
                    all_HGNC <- merge(rhesus2human, allgenes,
                                      by.x = "Gene.stable.ID",
                                      by.y = "row.names",
                                      all.X = T, all.Y = T)
                    all_HGNC <- all_HGNC[ , !(names(all_HGNC) %in% c("Gene.stable.ID"))]
                    all_HGNC <- avereps(all_HGNC, ID = all_HGNC$HGNC.symbol)
                    ###This file may have duplicates
                    write.table(all_HGNC, file = file.path(results_path2,
                                                 paste0(i, "_all.rnk")),
                                row.names = FALSE, col.names = FALSE,
                                sep = "\t", quote = FALSE)
                    all_HGNCnodup <- all_HGNC[!duplicated(all_HGNC[,2]), ]
                    ###No Duplicates in this file
                    write.table(all_HGNCnodup, file = file.path(results_path2,
                                               paste0(i,"_all4GSEA.rnk")),
                               row.names = FALSE, col.names = FALSE,
                               sep = "\t", quote = FALSE)

                } else {
                    if (length(significantgenes) > 0) {
                        write.table(significantgenes,
                                    file = file.path(results_path2,
                                            paste0(i, "_sig.rnk")),
                                    col.names = FALSE, sep = "\t",
                                    quote = FALSE)
                    } else {
                        print(paste0("WARNING: comparison ", i, " only has 0 significantly different gene therefore not generating enrichment file"))                        
                    }
                    write.table(allgenes, file = file.path(results_path2,
                                                 paste0(i, "_all.rnk")),
                                col.names = FALSE, sep = "\t", quote = FALSE)
                }
            }
            new_colnames <- c()
            for (i in colnames(ExpressMatrixLFC)) {
                i <- str_remove_all(i, "tr")
                new_colnames <- c(new_colnames, i)
            }
            colnames(ExpressMatrixLFC) <- new_colnames

            if (dim(ExpressMatrixLFC)[1] > 0) {
                hm_results      <- vizualize_DE_genes_HM(ExpressMatrixLFC,
                                                        file.path(results_path,
                                                        "3.heatmap_djn.png"))
                global_modules  <- hm_results$modules
                write.csv(global_modules,
                        file = file.path(results_path, "3.modules.csv"))
                global_modulesM <- as.matrix(global_modules)
                GM_HGNC         <- merge(rhesus2human, global_modulesM,
                                        by.x = "Gene.stable.ID",
                                        by.y = "row.names",
                                        all.X = T, all.Y = T)
                write.csv(GM_HGNC, file = file.path(results_path,
                                                "3.modules_HGNC.csv"))

                clustermatrix  <- hm_results$clustermatrix
                #invert row order
                clustermatrix   <- clustermatrix[order(nrow(clustermatrix):1), ]
                write.csv(clustermatrix, file = file.path(results_path,
                                                        "3.Clustered_LFC.csv"),
                        quote = FALSE)

                colnames(results) <- new_colnames
                vizualize_DE_genes_bp(results, file.path(results_path,
                                            "3.barplot_NumDEgenes.png"))
            } else {
                print ("WARNING: NO SIGNIFICANT GENES SO NOT GENERATE FIGURES OR EXTRA FILES")
            }
        } else {
            print("WARNING: need to specify matrix file")
        }
    }
}

convert2HGNC <- function(gene_conversion_file, input_file,
                        output_file, results_path) {

    if (typeof(gene_conversion_file) == "character") {

        rhesus2human <- read.csv(file = gene_conversion_file, header = TRUE,
                                 stringsAsFactors = FALSE)
        DE_HGNC_LFC <- read.csv(file.path(results_path, input_file), header = T,
                                row.names = 1, check.names = FALSE, sep = ",")
        if (dim(DE_HGNC_LFC)[1] > 0) {
            DE_HGNC_LFC <- merge(rhesus2human, DE_HGNC_LFC,
                                by.x = "Gene.stable.ID", by.y = "row.names")
            DE_HGNC_LFC <- avereps(DE_HGNC_LFC, ID = DE_HGNC_LFC$HGNC.symbol)
            write.csv(DE_HGNC_LFC, file = file.path(results_path, output_file))
        } else {
            print("WARNING NO SIGNIFICANT GENES SO CAN'T GENERATE HGNC FILE")
        }
    } else {
        print("WARNING: need to specify conversion file to convert Ensembls to HGNCs")
    }
}

vizualize_DE_genes_HM <- function(data, plot_file) {
    print("STATUS: Generating heatmap of DE genes...")
    png(plot_file, width = 8, height = 10, units = "in", res = 300)
    global_modules <- heatmap.F.4(data, cutoff = 1, distmethod = "bicor",
                                  clustermethod = "ward.D2", clusterdim = "row")
    dev.off()
    return(global_modules)
}

vizualize_DE_genes_bp <- function(results, plot_file) {
    print("STATUS: Generating bar plot of number of DE genes...")
    results_t <- t(summary(results))
    results_t <- results_t[, -2]

    for (i in 1:(length(row.names(results_t)))) {
        results_t[i, 1] <- results_t[i, 1] * -1
    }

    DE <- as.data.frame(results_t)
    DE <- setnames(DE, old = c("Var1", "Var2", "Freq"),
                   new = c("Time_Point", "group", "DE_genes"))

    #Create plot
    ggplot(DE, aes(x = Time_Point, y = DE_genes, fill = group,
           label = DE$DE_genes)) +
           geom_bar(stat = "identity", position = "identity") +
    # geom_text(size = 5, position = position_stack(vjust = 0) )+
    scale_fill_manual(values = c("#9d9dff", "#ff4d4d")) +
    ylab("Number of Differentially Expressed Genes") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
    ggsave(plot_file, dpi = 300)
}
galelab/GaleGEAnalysis documentation built on May 18, 2020, 7:32 a.m.