#' @keywords internal
#' @importFrom dplyr %>%
"_PACKAGE"
# quiets concerns of R CMD check re: the .'s that appear in pipelines
# reference: https://github.com/jennybc/googlesheets/blob/master/R/googlesheets.R
if (getRversion() >= "2.15.1") {
utils::globalVariables(
base::c(".")
)
}
## To test if a suggested package is installed
.onAttach <- function(libname, pkgname) {
packageStartupMessage("Attaching txomics package")
}
#' retrieve metadata from NCBI SRA
#'
#' If you are using SRA accessions, it's possible to retrieve
#' sample_table or matadata based on the deposited information
#' retrieve metadata from NCBI SRA
#'
retrieve_metadata_sra <- function() {
# This function retrieve metadata from sequencing run
# Using NCBI SRA
# INCOMPLETE
}
#' Import Expression Data
#'
#' Function used to import transcripts count and abundance data
#'
#' @param path path to files, or vector of paths,
#' if \code{is_dir} is set to FALSE
#'
#' @param source quantification method used.
#' Currently supported: "salmon", "kallisto".
#' If method is unknown use "none".
#'
#' @param accession source of gene names identifier
#' supported formats:"vectorbase", "gene", "custom"
#' If using "gene", the trancript names will be used as genes;
#' If using "custom", a conversion table should be
#' provided in the gene_table parameter;
#' not supported yet: "ensembl", "ncbi", "custom"
#'
#' @param is_dir if \code{path} is a directory containing
#' subdirectories with the files,
#' if \code{is_dir} is set to FALSE, path should be
#' a vector of paths;
#' default = TRUE
#'
#' @param names .vector of names to be used to each sample; default = NULL
#'
#' @param gene_table a transcript to gene conversion table,
#' where the first column should be transcript name
#' and the second column should be gene name,
#' should only be used with accession = "custom"
#'
#' @return A \code{txi} object containing
#'
#' @examples
#' \dontrun{
#' # Example with path to folder input
#' imported_transcripts <- import_tx("data/salmon/quants/")
#' }
#'
#' @export
#'
import_tx <- function(path,
source = "salmon",
accession = "vectorbase",
is_dir = TRUE,
names = NULL,
gene_table = NULL) {
## Function used to import transcripts abundance data
if ( isTRUE(is_dir)) {
if (isTRUE(source == "salmon")) {
path_to_files <- c(
fs::path(fs::dir_ls(path), "quant.sf"),
fs::path(path, "quant.sf")
)
path_to_files <- path_to_files[fs::file_exists(path_to_files) == TRUE]
} else {
path_to_files <- fs::dir_ls(path)
path_to_files <- path_to_files[fs::is_dir(path_to_files)]
path_to_files <- fs::dir_ls(path_to_files)
}
} else {
path_to_files <- path
}
message("Using files:\n", paste(path_to_files, collapse = "\n"), "\n")
if (!is.null(names)) {
if (isTRUE(length(names) == length(path_to_files))) {
names(path_to_files) <- names
} else {
stop("Number of names (sample names) should be the same as files to read.")
}
} else {
names(path_to_files) <- path_to_files %>%
stringr::str_extract("/(.*)/") %>%
stringr::str_remove("/$") %>%
stringr::str_remove(".*/")
}
tx_accession <- readr::read_delim(
path_to_files[1], "\t",
escape_double = FALSE, trim_ws = TRUE
) %>%
dplyr::pull(1)
if (isTRUE(accession == "vectorbase")) {
gene_vectorbase <- tx_accession %>% stringr::str_remove("-R.*")
tx_to_gene <- dplyr::tibble(
"TXNAME" = tx_accession,
"GENEID" = gene_vectorbase
)
}
if (isTRUE(accession == "gene")) {
tx_to_gene <- dplyr::tibble(
"TXNAME" = tx_accession,
"GENEID" = tx_accession
)
}
if (isTRUE(accession == "custom")) {
tx_custom <- gene_table %>%
dplyr::pull(1)
gene_custom <- gene_table %>%
dplyr::pull(2)
tx_to_gene <- dplyr::tibble(
"TXNAME" = tx_custom,
"GENEID" = gene_custom
)
}
if (isTRUE(source == "none")) {
#col_names <- c("target_id","eff_length","est_counts","tpm")
tx <- tximport::tximport(
files = path_to_files,
type = source,
tx2gene = tx_to_gene#,
#txIdCol = col_names[1],
#abundanceCol = col_names[4],
#countsCol = col_names[3],
#lengthCol = col_names[2]
)
}else {
tx <- tximport::tximport(
files = path_to_files,
type = source,
tx2gene = tx_to_gene
)
}
tx
}
#' Infer Library Construction Type
#'
#' Retrieve optimal library preparation
#' protocol used for Salmon method of
#' mapping.
#'
#' @param path path to Salmon results files
#'
#' @return A \code{tibble} object
#'
#' @export
#'
# path <- "data/salmon/res_quants/"
salmon_libtype <- function(path) {
files <- list.files(path)
files %>%
purrr::map_df(
~{
lib_name <- .x
lib_type <- paste0(path, "/", lib_name, "/lib_format_counts.json") %>%
jsonlite::read_json() %>%
.$`expected_format`
dplyr::tibble(lib_name, lib_type)
},
path
)
}
#' Retrieve Mapping Rate
#'
#' Retrieve total number of fragments,
#' total mapped reads and
#' mapping rate from a Salmon mapped
#' sample.
#'
#' @param path path to Salmon results files
#'
#' @return A \code{tibble} object
#'
#' @export
#'
retrieve_mapping_rate <- function(path) {
libs <- list.files(path)
purrr::map_df(libs, ~{
salmon_log <- readr::read_lines(paste0(path, "/", .x, "/logs/salmon_quant.log"))
mapping_pct <- salmon_log %>%
stringr::str_extract("info.*Mapping rate = .*%") %>%
.[!is.na(.)] %>%
stringr::str_extract("(\\d)+.(\\d)+%")
mapped_reads <- salmon_log %>%
stringr::str_extract("info.*Counted.*total reads") %>%
.[!is.na(.)] %>%
stringr::str_extract("(\\d)+")
total_reads <- salmon_log %>%
stringr::str_extract("Observed.*total fragments") %>%
.[!is.na(.)] %>%
stringr::str_extract("(\\d)+")
dplyr::data_frame(sample = .x, mapped_reads, total_reads, mapping_pct) %>%
dplyr::distinct()
})
}
#' Filter Imported Transcripts
#'
#' Filter salmon quantification files
#' in a \code{txi} format
#' based on metadata column and levels
#'
#' @param tx txi object
#'
#' @param sample_table metadata data frame, containing description
#' of each sample and experimental design
#'
#' @param var_column variable to filter the input
#'
#' @param var_levels levels of var_column to filter the input
#'
#' @return A \code{txi} object
#'
#' @export
#'
# tx <- imported_transcripts;sample_table <- metadata_df;var_column <- "population";var_levels <- c("Botucatu2014", "Rio2013")
filter_tx <- function(tx, sample_table, var_column = NULL, var_levels = NULL) {
first_column <- colnames(sample_table)[1]
if (!is.null(var_levels)) {
extracted_cols <- sample_table %>%
dplyr::filter((!!as.name(var_column)) %in% var_levels) %>%
dplyr::select(1) %>%
dplyr::filter((!!as.name(first_column)) %in% colnames(tx$counts)) %>%
unlist()
} else {
extracted_cols <- sample_table %>%
dplyr::select(1) %>%
unlist()
}
tx$counts <- tx$counts[, extracted_cols]
tx$abundance <- tx$abundance[, extracted_cols]
tx$length <- tx$length[, extracted_cols]
tx
}
#' Perform Differential Expression Analysis
#'
#' Run DESeq2 pipeline for differentially expressed genes.
#'
#' @param tx imported transcripts object containing reads counts
#'
#' @param sample_table metadata data frame, containing description
#' of each sample and experimental design
#'
#' @param contrast_var column name from the sample_table
#'
#' @param numerator factor from contrast_var to be considered as treatment
#'
#' @param denominator factor from contrast_var to be considered as control
#'
#' @param batch_var column from sample_table to be considered for batch effect
#'
#' @param beta_prior default = TRUE, set to FALSE to use apeglm method
#'
#' @param force_rep default = FALSE, experimental
#'
#' @param parallel Use all available cores to run.
#' Uses significantly more RAM. Default = FALSE.
#'
#' @return A \code{tibble} object
#'
#' @importFrom stats as.formula
#'
#' @export
#'
# tx <- tx_rio_2013; sample_table = metadata_df = samples_metadata
# contrast_var <- "condition"; numerator <- "resistant"; denominator <- "susceptible"
de_analysis <- function(tx, sample_table, contrast_var,
numerator, denominator,
batch_var = NULL,
beta_prior = TRUE,
force_rep = FALSE,
parallel = FALSE) {
lib_names <- colnames(tx$counts)
var_column <- colnames(sample_table[1])
sample_table <- sample_table %>%
dplyr::filter(!!as.name(var_column) %in% lib_names)
if (is.null(batch_var)) {
design_formula <- as.formula(paste0("~", contrast_var))
} else {
design_formula <- as.formula(paste0("~", batch_var, " + ", contrast_var))
}
dds1 <- DESeq2::DESeqDataSetFromTximport(tx,
sample_table,
design = design_formula
)
number_of_numerator_samples <- sample_table %>%
dplyr::filter(!!as.name(contrast_var) == numerator) %>%
base::nrow()
number_of_denominator_samples <- sample_table %>%
dplyr::filter(!!as.name(contrast_var) == denominator) %>%
base::nrow()
sample_replicates <- TRUE
if (number_of_numerator_samples < 2) {
sample_replicates <- FALSE
}
if (number_of_denominator_samples < 2) {
sample_replicates <- FALSE
}
## START Experimental
if (isTRUE(force_rep)) {
dds <- DESeq2::DESeq(dds1, betaPrior = TRUE)
res <- DESeq2::results(dds,
alpha = 0.05,
parallel = parallel,
contrast = c(contrast_var, numerator, denominator)
)
if (!isTRUE("data.frame" %in% class(res))) {
res <- res %>%
base::as.data.frame() %>%
dplyr::as_tibble(rownames = "gene")
}
return(res)
}
## END Experimental
if (isTRUE(sample_replicates)) {
if (isTRUE(beta_prior)) {
dds <- DESeq2::DESeq(dds1,
parallel = parallel,
betaPrior = TRUE
)
res <- DESeq2::results(dds,
alpha = 0.05,
parallel = parallel,
contrast = c(contrast_var, numerator, denominator)
)
} else {
dds <- DESeq2::DESeq(dds1,
parallel = parallel,
betaPrior = FALSE)
res <- DESeq2::lfcShrink(dds,
parallel = TRUE,
coef = paste0(
contrast_var, "_",
numerator,
"_vs_", denominator
),
parallel = parallel,
type = "apeglm"
)
}
} else {
numerator_name <- sample_table %>%
dplyr::filter(!!as.name(contrast_var) == numerator)
numerator_name <- as.character(numerator_name[, 1])
denominator_name <- sample_table %>%
dplyr::filter(!!as.name(contrast_var) == denominator)
denominator_name <- as.character(denominator_name[, 1])
res <- tx$abundance %>%
dplyr::as_tibble(rownames = "gene") %>%
dplyr::mutate(`FC` = !!as.name(numerator_name) /
!!as.name(denominator_name)) %>%
dplyr::arrange(-`FC`) %>%
dplyr::mutate(`log2FoldChange` = log2(`FC`)) %>%
dplyr::mutate(
log2FoldChange = dplyr::if_else(
is.finite(`log2FoldChange`), `log2FoldChange`, NA_real_
)
) %>%
dplyr::mutate(pvalue = NA_real_) %>%
dplyr::mutate(padj = NA_real_) %>%
dplyr::select(-(!!numerator_name)) %>%
dplyr::select(-(!!denominator_name)) %>%
dplyr::select(-`FC`) %>%
dplyr::arrange(`gene`)
}
if (!isTRUE("data.frame" %in% class(res))) {
res <- res %>%
base::as.data.frame() %>%
dplyr::as_tibble(rownames = "gene")
}
res
}
#' Perform Gene Set Enrichment Analysis
#'
#' Perform GSEA analysis.
#'
#' @param de_res results from de_analysis function
#'
#' @param gene_sets tidy data frame of the gene sets or
#' named list of genes named by gene set
#'
#' @param fdr should fdr be used, default = NULL.
#' If fdr = NULL, uses only log2FoldChange to rank.
#'
#'
#' @param file_ext File extension where the gene
#' sets should be loaded GMT is the default for GSEA
#' rds to load rds saved files. DEPRECATED
#'
#' @return A \code{tibble} containing GSEA results
#'
#' @export
#'
# de_res <- de_res_rio2013
# gene_sets <- aaegdata::kegg_gene_sets
gsea_analysis <- function(de_res,
gene_sets,
fdr = NULL,
file_ext = "gmt") {
temp_res <- de_res %>%
dplyr::mutate(
log2FoldChange = dplyr::if_else(is.na(log2FoldChange), 0, log2FoldChange)
) %>%
dplyr::mutate(pvalue = dplyr::if_else(is.na(pvalue), 1, pvalue)) %>%
dplyr::mutate(padj = dplyr::if_else(is.na(padj), 1, padj))
if (isTRUE(fdr)){
sorted_res <- temp_res %>%
dplyr::mutate(score = (log2FoldChange * (-log10(padj)))) %>%
dplyr::select(gene, log2FoldChange, padj, score) %>%
dplyr::arrange(score)
} else if (is.null(fdr)) {
sorted_res <- temp_res %>%
dplyr::mutate(score = log2FoldChange ) %>%
dplyr::select(gene, log2FoldChange, score) %>%
dplyr::arrange(score)
} else {
sorted_res <- temp_res %>%
dplyr::mutate(score = (log2FoldChange * (-log10(pvalue)))) %>%
dplyr::select(gene, log2FoldChange, pvalue, score) %>%
dplyr::arrange(score)
}
# ordered from strongest down-regulated to strongest upregulated
ranked_genes <- sorted_res$score
names(ranked_genes) <- sorted_res$gene
if (isTRUE("data.frame" %in% class(de_res))) {
gene_set_col <- base::colnames(gene_sets)[1]
temp_list <- gene_sets %>%
dplyr::group_by(!!as.name(gene_set_col)) %>%
dplyr::summarise(gene = list(gene)) %>%
dplyr::ungroup()
gene_sets_list <- temp_list$gene
names(gene_sets_list) <- temp_list %>%
base::as.data.frame() %>%
.[, gene_set_col]
gene_sets <- gene_sets_list
if (!is.list(gene_sets)) {
if (file_ext == "gmt") {
gene_sets <- fgsea::gmtPathways(gene_sets)
} else {
if (file_ext == "rds") {
gene_sets <- readr::read_rds(gene_sets)
}
}
}
}
gene_sets <- purrr::map(gene_sets, unique)
gsea_res <- fgsea::fgsea(
pathways = gene_sets,
stats = ranked_genes,
nperm = 100000
)
gsea_res <- gsea_res %>%
dplyr::as_tibble() %>%
dplyr::rename(pvalue = pval) %>%
dplyr::rename(leading_edge = leadingEdge)
gsea_res
}
#' Perform Leading Edge Analysis
#'
#' Extract Leading Edge genes from GSEA results.
#'
#' @param gsea_res result from gsea_analysis
#'
#' @param set gene set to be extracted
#'
#' @return A \code{vector} of leading edge genes for the set
#'
#' @export
#'
le_analysis <- function(gsea_res, set) {
gsea_res %>%
tidyr::unnest() %>%
dplyr::filter(pathway == set) %>%
dplyr::pull(leading_edge)
}
#' Retrieve Differentially Expressed Genes
#'
#' Extract differentially expressed genes from
#' DESeq2 results based on p-value cuttof.
#'
#' @param de_res de_analysis result
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @return a \code{vector} of gene names
#'
#' @export
#'
retrieve_deg <- function(de_res, pvalue = 0.05, fdr = FALSE) {
alpha <- pvalue
if (!isTRUE("data.frame" %in% class(de_res))) {
de_res <- de_res %>%
base::as.data.frame() %>%
dplyr::as_tibble(rownames = "gene")
}
if (isTRUE(fdr)) {
deg_genes <- de_res %>%
dplyr::filter(padj <= alpha) %>%
dplyr::pull(gene)
} else {
deg_genes <- de_res %>%
dplyr::filter(pvalue <= alpha) %>%
dplyr::pull(gene)
}
deg_genes
}
###############################################
## #
## Plot Functions #
## #
###############################################
#' Plot a Heatmap From Expression Data
#'
#' Plot a heatmap from expression data
#'
#' @param tx txi object, matrix or data frame with trancript abundance
#'
#' @param sample_table metadata data frame, containing description
#' of each sample and experimental design
#'
#' @param color_by variable from sample table to group samples in the plot
#'
#' @param num number of genes to plot, filtering by variance between samples,
#' if num is negative, it is used the absolute num of genes with
#' smaller, if num is NULL all genes are used.
#' default = NULL.
#'
#' @param scale logical, default = "row", normalize data by row, column or none
#'
#' @param show_rownames logical, default = FALSE
#'
#' @param ... additional plot parameters, check pheatmap documentation for
#' details
#'
#' @export
#'
plot_heatmap <- function(tx, sample_table, color_by = NULL,
num = NULL, scale = "row",
show_rownames = FALSE, ...) {
expression_matrix <- txomics::prepare_tx_mat(tx)
if (is.null(num)) {
num <- nrow(expression_matrix)
}
mat <- expression_matrix[base::rowSums(expression_matrix) > 1, ]
## Scaling matrix
if (isTRUE(scale == "row")) {
mat <- base::t(base::scale(base::t(mat), center = TRUE, scale = TRUE))
} else {
if (isTRUE(scale == "column")) {
mat <- base::scale(mat, center = TRUE, scale = TRUE)
}
}
filtered_mat <- mat %>%
dplyr::as_data_frame(rownames = "gene") %>%
dplyr::mutate(row_var = txomics::calc_var_by_row(.[2:length(.)])) %>%
dplyr::arrange(-row_var)
if (isTRUE(num > 0)) {
filtered_mat <- filtered_mat %>%
utils::head(num)
}
if (isTRUE(num < 0)) {
num <- abs(num)
filtered_mat <- filtered_mat %>%
utils::tail(num)
}
if (isTRUE(num == 0)) {
stop("Number of genes can't be zero.")
}
filtered_mat <- filtered_mat %>%
dplyr::select(-row_var)
mat <- filtered_mat %>%
dplyr::select(-gene) %>%
as.matrix()
rownames(mat) <- filtered_mat$gene
if (is.null(color_by)) {
annotation_df <- NA
annotation_pallete <- NA
} else {
annotation_df <- sample_table %>%
dplyr::select(1, !!color_by)
row_names <- annotation_df %>% dplyr::pull(1)
annotation_df <- annotation_df %>%
dplyr::select(-1) %>%
base::as.data.frame()
base::rownames(annotation_df) <- row_names
## Generate Annotation discrete color pallette
df_names <- colnames(annotation_df)
df_lengths <- df_names %>%
purrr::map_dbl(~length(unique(annotation_df[[.x]])))
annotation_pallete <- purrr::map(
seq_along(df_lengths),
~{
discrete_pallete <- df_lengths[.x] %>%
base::sum() %>%
viridis::viridis()
discrete_pallete
}
)
names(annotation_pallete) <- df_names
for (i in colnames(annotation_df)) {
names(annotation_pallete[[i]]) <- unique(annotation_df[, i])
}
}
## Sorting dendograms
sort_hclust <- function(...) {
stats::as.hclust(dendsort::dendsort(stats::as.dendrogram(...)))
}
mat_cluster_cols <- stats::hclust(stats::dist(t(mat)))
mat_cluster_cols <- sort_hclust(mat_cluster_cols)
mat_cluster_rows <- sort_hclust(stats::hclust(stats::dist(mat)))
utils::assignInNamespace(
x = "draw_colnames",
value = "draw_colnames_45",
ns = asNamespace("pheatmap")
)
if (!isTRUE(scale == "none")) {
color_function <- function() {
grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(
n = 7, name =
"PRGn"
)))(100)
}
} else {
color_function <- function() {
viridis::viridis(100, direction = 1)
}
}
breaks_value <- NA
if (isTRUE(scale == "row")) {
breaks_value <- base::seq(-3, 3, length.out = 101)
}
pheatmap::pheatmap(
mat = mat,
color = color_function(),
breaks = breaks_value,
annotation_col = annotation_df,
annotation_colors = annotation_pallete,
show_rownames = show_rownames,
border_color = NA,
treeheight_col = 30,
treeheight_row = 30,
drop_levels = TRUE,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
...
)
}
#' Plot Flat Three Dimensional PCA
#'
#' Plot the results of a pseudo/flat tri-dimensional
#' principal component analysis, using 2 two dimensional plots.
#'
#' @param tx txi object, matrix or data frame with trancript abundance
#'
#' @param sample_table metadata data frame, containing description
#' of each sample and experimental design
#'
#' @param color_by variable from sample table to group samples in the plot
#'
#' @param num number of genes to plot, filtering by variance between samples,
#' if num is negative, it is used the absolute num of genes with
#' smaller, if num is NULL all genes are used.
#' default = NULL.
#'
#' @param label print labels for each sample. Default = FALSE
#'
#' @importFrom stats prcomp
#'
#' @importFrom utils head
#'
#' @export
#'
plot_pca <- function(tx,
sample_table,
color_by = NULL,
num = NULL,
label = FALSE) {
expression_matrix <- txomics::prepare_tx_mat(tx)
if (is.null(num)) {
num <- nrow(expression_matrix)
}
mat <- expression_matrix[rowSums(expression_matrix) > 1, ]
## Scaling matrix by row
mat <- base::t(base::scale(base::t(mat), center = TRUE, scale = TRUE))
filtered_mat <- mat %>%
dplyr::tibble(rownames = "gene") %>%
dplyr::mutate(row_var = txomics::calc_var_by_row(.[2:length(.)])) %>%
dplyr::arrange(-row_var)
if (isTRUE(num > 0)) {
filtered_mat <- filtered_mat %>%
utils::head(num)
}
if (isTRUE(num < 0)) {
num <- abs(num)
filtered_mat <- filtered_mat %>%
utils::tail(num)
}
if (isTRUE(num == 0)) {
stop("Number of genes can't be zero.")
}
filtered_mat <- filtered_mat %>%
dplyr::select(-row_var)
mat <- filtered_mat %>%
dplyr::select(-gene) %>%
base::as.matrix()
base::rownames(mat) <- filtered_mat$gene
pca <- mat %>%
base::t() %>%
stats::prcomp()
var_percent <- pca$sdev^2 / base::sum(pca$sdev^2)
pc_1 <- pca$x[, 1]
pc_1_df <- dplyr::tibble(
sample = names(pc_1),
PC1 = pc_1
)
pc_2 <- pca$x[, 2]
pc_2_df <- dplyr::tibble(
sample = names(pc_2),
PC2 = pc_2
)
pc_3 <- pca$x[, 3]
pc_3_df <- dplyr::tibble(
sample = names(pc_3),
PC3 = pc_3
)
if (is.null(color_by)) {
## plot without colors
df <- pc_1_df %>%
dplyr::left_join(pc_2_df, by = "sample") %>%
dplyr::left_join(pc_3_df, by = "sample") %>%
tidyr::gather(pcomp, value, -c(sample, PC1))
breaks_x <- base::pretty(df$value)
breaks_y <- base::pretty(df$PC1)
plot_obj <- df %>%
ggplot2::ggplot(ggplot2::aes(x = value, y = PC1, color = sample)) +
ggplot2::geom_point(size = 4) +
ggplot2::xlab(paste0(
"PC2: ", round(var_percent[2] * 100), "% variance",
"\t PC3: ", round(var_percent[3] * 100),
"% variance"
)) +
ggplot2::ylab(paste0(
"PC1: ", round(var_percent[1] * 100),
"% variance"
)) +
ggplot2::facet_grid(cols = ggplot2::vars(pcomp), scales = "free") +
ggplot2::scale_color_viridis_d(direction = 1) +
ggplot2::annotate("segment",
x = -Inf, xend = -Inf,
y = -Inf, yend = Inf, color = "black"
) +
ggplot2::annotate("segment",
x = -Inf, xend = -Inf,
y = -Inf, yend = Inf, color = "black"
) +
ggpubr::theme_pubr() +
ggplot2::theme(
strip.background = ggplot2::element_blank(),
strip.text.x = ggplot2::element_blank()
) +
ggplot2::scale_x_continuous(
breaks = breaks_x,
limits = base::range(breaks_x)
) +
ggplot2::scale_y_continuous(
breaks = breaks_y,
limits = base::range(breaks_y)
)
if (isTRUE(label)) {
plot_obj <- plot_obj +
ggrepel::geom_text_repel(ggplot2::aes(label = sample))
}
return(plot_obj)
}
if (!isTRUE(all(color_by %in% names(sample_table)))) {
stop("the values of 'color_by' should specify columns of sample_table metadata")
}
group_df <- sample_table %>%
dplyr::select(sample, !!color_by)
df <- pc_1_df %>%
dplyr::left_join(pc_2_df, by = "sample") %>%
dplyr::left_join(pc_3_df, by = "sample") %>%
tidyr::gather(pcomp, value, -c(sample, PC1)) %>%
dplyr::left_join(group_df, by = "sample") %>%
tidyr::unite(group, !!color_by)
breaks_x <- base::pretty(df$value)
breaks_y <- base::pretty(df$PC1)
plot_obj <- df %>%
ggplot2::ggplot(ggplot2::aes(x = value, y = PC1,
color = group)) +
ggplot2::geom_point(size = 4) +
ggplot2::xlab(paste0(
"PC2: ", round(var_percent[2] * 100), "% variance",
"\t\t\t\t PC3: ", round(var_percent[3] * 100), "% variance"
)) +
ggplot2::ylab(paste0(
"PC1: ", round(var_percent[1] * 100),
"% variance"
)) +
ggplot2::facet_grid(cols = ggplot2::vars(pcomp), scales = "free") +
ggplot2::scale_color_viridis_d(direction = 1) +
ggplot2::annotate("segment",
x = -Inf, xend = -Inf,
y = -Inf, yend = Inf, color = "black"
) +
ggplot2::annotate("segment",
x = -Inf, xend = -Inf,
y = -Inf, yend = Inf, color = "black"
) +
ggpubr::theme_pubr() +
# ggplot2::coord_fixed() +
ggplot2::theme(
strip.background = ggplot2::element_blank(),
strip.text.x = ggplot2::element_blank()
) +
ggplot2::scale_x_continuous(breaks = breaks_x, limits = base::range(breaks_x)) +
ggplot2::scale_y_continuous(breaks = breaks_y, limits = base::range(breaks_y))
# ggplot2::xlab(paste0("PC2: ", round(var_percent[2] * 100), "% variance")) +
# ggplot2::ylab(paste0("PC1: ", round(var_percent[1] * 100), "% variance")) +
# ggplot2::scale_color_viridis_d( direction = 1 ) +
# ggpubr::theme_pubr()
if (isTRUE(label)) {
plot_obj <- plot_obj +
ggrepel::geom_text_repel(ggplot2::aes(label = sample))
}
plot_obj
}
#' Volcano Plot
#'
#' A volcano plot from DE genes analysis results.
#'
#' @param de_res de_analysis results
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param lfc_threshold absolute value of log2FoldChange to limit the significance
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param max maximum number of names of DEGs to be plotted. Default = 30.
#'
#' @return A \code{ggplot} object
#'
#' @export
#'
plot_volcano <- function(de_res,
pvalue = 0.05,
lfc_threshold = NULL,
fdr = FALSE,
max = 30) {
# scatter_plot <- function(data, x, y) {
# x <- enquo(x)
# y <- enquo(y)
#
# ggplot(data) + geom_point(aes(!!x, !!y))
# }
# scatter_plot(mtcars, disp, drat)
alpha <- pvalue
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
df <- de_res %>%
dplyr::select(gene, log2FoldChange, pvalue, padj)
if (isTRUE(fdr)) {
df <- df %>%
dplyr::filter(!is.na(padj)) %>%
dplyr::mutate(sig_alpha = dplyr::if_else(padj < alpha, 1, 0))
} else {
df <- df %>%
dplyr::filter(!is.na(pvalue)) %>%
dplyr::mutate(sig_alpha = dplyr::if_else(pvalue < alpha, 1, 0))
}
if (is.null(lfc_threshold)) {
if (isTRUE(base::nrow(dplyr::filter(df, sig_alpha == 1)) == 0)) {
lfc_cutoff <- 2
} else {
lfc_cutoff <- df %>%
dplyr::filter(sig_alpha == 1) %>%
dplyr::pull(log2FoldChange) %>%
abs() %>%
max()
}
} else {
lfc_cutoff <- lfc_threshold
}
df <- df %>%
dplyr::mutate(sig_lfc = dplyr::if_else(
abs(log2FoldChange) >= lfc_cutoff, 2, 0
))
df <- df %>%
dplyr::mutate(sig = as.character(sig_alpha + sig_lfc)) %>%
dplyr::select(-c(sig_alpha, sig_lfc)) %>%
dplyr::filter(!is.na(log2FoldChange))
pvalue_line <- alpha
if (isTRUE(fdr)) {
pvalue_line <- df %>%
dplyr::filter(padj < alpha)
if (isTRUE(nrow(pvalue_line) == 0)) {
pvalue_line <- df %>%
dplyr::pull(pvalue) %>%
base::min()
} else {
pvalue_line <- pvalue_line %>%
dplyr::pull(pvalue) %>%
base::max()
}
}
if (isTRUE(max(abs(df$log2FoldChange)) <= abs(lfc_cutoff))) {
breaks_x <- base::pretty(base::range(-lfc_cutoff, lfc_cutoff))
} else {
breaks_x <- base::pretty(df$log2FoldChange)
}
if (isTRUE(max(-log10(df$pvalue)) <= 4)) {
range_y <- c(0, 4)
} else {
range_y <- -log10(df$pvalue)
}
breaks_y <- base::pretty(range_y)
pvalue_cutoff <- pvalue
sig_text <- c(
paste0(
as.character(pvalue_var)[2],
" > ", as.character(pvalue_cutoff)
),
paste0(
as.character(pvalue_var)[2],
" <= ", as.character(pvalue_cutoff)
),
paste0(
"abs(log2FC) > ",
as.character(lfc_cutoff)
),
paste0(
as.character(pvalue_var)[2], " <= ",
as.character(pvalue_cutoff), " & ",
"abs(log2FC) >= ", as.character(lfc_cutoff)
)
)
df <- df %>%
dplyr::mutate(sig = dplyr::if_else(
sig == "0", sig_text[1], sig
)) %>%
dplyr::mutate(sig = dplyr::if_else(
sig == "1", sig_text[2], sig
)) %>%
dplyr::mutate(sig = dplyr::if_else(
sig == "2", sig_text[3], sig
)) %>%
dplyr::mutate(sig = dplyr::if_else(
sig == "3", sig_text[4], sig
))
df$sig <- factor(df$sig, levels = sig_text)
color_pallete <- viridis::viridis(4)
names(color_pallete) <- sig_text
ggrepel_df <- df %>%
dplyr::filter(sig %in% sig_text[4]) %>%
dplyr::mutate(temp_var = (abs(log2FoldChange)*(-log10(pvalue)))) %>%
dplyr::arrange(-temp_var) %>%
utils::head(n = max)
p <- df %>%
ggplot2::ggplot(ggplot2::aes(log2FoldChange, -log10(pvalue))) +
ggplot2::geom_hline(yintercept = -log10(pvalue_line), alpha = 0.5) +
ggplot2::geom_vline(xintercept = lfc_cutoff, alpha = 0.5) +
ggplot2::geom_vline(xintercept = -lfc_cutoff, alpha = 0.5) +
ggplot2::geom_point(ggplot2::aes(color = sig)) +
ggplot2::scale_color_manual(
values = color_pallete
) +
ggrepel::geom_text_repel(
data = ggrepel_df,
ggplot2::aes(label = gene)
) +
ggplot2::scale_x_continuous(
breaks = breaks_x, limits = base::range(breaks_x * 1.1)
) +
ggplot2::scale_y_continuous(
breaks = breaks_y, limits = base::range(breaks_y)
) +
ggpubr::theme_pubr() +
ggplot2::xlab(~Log[2] ~ "(FC)") +
ggplot2::ylab(~-Log[10] ~ "(p-value)") +
ggplot2::labs(color = " ")
p
}
#' Plot GSEA Enrichment Curve and Table
#'
#' Plots enrichment table and individual enrichment graph for
#' GSEA analysis results
#'
#' @param gsea_res result from gsea_analysis
#'
#' @param de_res results from de_analysis function
#'
#' @param gene_sets named list of genes named by gene set
#'
#' @param top_sets Number of sets to be plotted in the gsea table,
#' the number of sets to be plotted is the double of the value
#' of top_sets
#' or a string for individual gene set for enrichment plot
#'
#' @param enrichment If set to TRUE, enrichment for individual
#' gene set will be plotted.
#' The default value is FALSE, for plotting GSEA table.
#'
#' @param title title label to be shown for enchment plot,
#' should be a character vector of the text to be plotted
#' or TRUE for using the .default = NULL, no text plotted.
#'
#' @return graphs for gsea table or gsea individual set
#'
#' @export
#'
# gsea_res <- gsea_res_batch; de_res <- de_res_rio_2013; gene_sets <- aaegdata::kegg_gene_sets;
# top_sets <- 10; enrichment <- FALSE; title = TRUE
plot_gsea <- function(gsea_res, de_res, gene_sets,
top_sets = 10, enrichment = FALSE,
title = NULL) {
de_res$log2FoldChange[is.na(de_res$log2FoldChange)] <- 0
sorted_res <- de_res %>%
dplyr::select(gene, log2FoldChange) %>%
dplyr::arrange(log2FoldChange)
# ordered from strongest down-regulated to strongest upregulated
ranked_genes <- sorted_res$log2FoldChange
names(ranked_genes) <- sorted_res$gene
if (isTRUE("data.frame" %in% class(de_res))) {
gene_set_col <- base::colnames(gene_sets)[1]
temp_list <- gene_sets %>%
dplyr::group_by(!!as.name(gene_set_col)) %>%
dplyr::summarise(gene = list(gene)) %>%
dplyr::ungroup()
gene_sets_list <- temp_list$gene
names(gene_sets_list) <- temp_list %>%
base::as.data.frame() %>%
.[, gene_set_col]
gene_sets <- gene_sets_list
if (!is.list(gene_sets)) {
if (file_ext == "gmt") {
gene_sets <- fgsea::gmtPathways(gene_sets)
} else {
if (file_ext == "rds") {
gene_sets <- readr::read_rds(gene_sets)
}
}
}
}
gene_sets <- purrr::map(gene_sets, unique)
if (isTRUE(enrichment)) {
if (isTRUE(title)) {
title <- top_sets
}
fgsea::plotEnrichment(
gene_sets[[top_sets]],
ranked_genes
) +
ggplot2::labs(title = title) +
ggpubr::theme_pubr()
# labs(title="Programmed Cell Death")
} else {
top_paths_up <- gsea_res %>%
dplyr::filter(ES > 0) %>%
dplyr::arrange(pvalue) %>%
utils::head(n = top_sets) %>%
dplyr::pull(1)
top_paths_down <- gsea_res %>%
dplyr::filter(ES < 0) %>%
dplyr::arrange(pvalue) %>%
utils::head(n = top_sets) %>%
dplyr::pull(1)
gsea_res <- gsea_res %>%
dplyr::rename(pval = pvalue)
top_paths <- c(top_paths_up, rev(top_paths_down))
fgsea::plotGseaTable(gene_sets[top_paths],
ranked_genes,
gsea_res,
colwidths = c(9, 3, 0.8, 1.2, 1.2)
)
}
}
#' Plot Venn Diagram
#'
#' Plot Venn diagrams, from de_analysis or gsea_analysis results
#' **Currently, this version only plots Venn diagram with four groups and just
#' save files to file, don't plot directly**
#'
#' @param ... de_analysis or gsea_analysis data frames
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param names vector with names to be used for each group
#'
#' @param filename path to were the image should be saved
#'
#' @param width value to be used as the width of the plot, the venn diagram
#' will be plotted as a square with side of the smaller
#' value, between width and height
#'
#' @param height value to be used as the height of the plot, the venn diagram
#' will be plotted as a square of the smaller value, between
#' width and height. heigth cam be ommited and the plot will be
#' a square.
#'
#' @export
#'
plot_venn_diagram <- function(..., pvalue = 0.05, fdr = FALSE, names = NULL,
filename, width = 9, height = NULL) {
file_extension <- filename %>%
stringr::str_extract("\\.[a-zA-Z]*$")
if (!isTRUE(identical(file_extension, ".pdf"))) {
# remove to introduce another extensions
stop("filename is not PDF")
}
res_list <- list(...)
if (isTRUE(length(res_list) == 1)) {
if (!is.list(res_list)) {
stop("can't plot a venn diagram for one group")
}
}
if (!is.null(names(res_list))) {
names <- names(res_list)
}
if (is.null(names)) {
} else {
if (isTRUE(identical(length(res_list), length(names)))) {
names(res_list) <- names
} else {
stop("length of names should be the same of the objects passed to '...'")
}
}
file_path <- filename %>% stringr::str_replace(file_extension, "")
file_name_svg <- paste0(file_path, ".svg")
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
pvalue_cutoff <- pvalue
input_list <- res_list %>%
purrr::map(~{
.x %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff) %>%
dplyr::pull(1)
})
names(input_list) <- names
output_height <- height
output_width <- width
if (is.null(height)) {
output_height <- width
}
venn_width <- min(output_height, output_width)
VennDiagram::venn.diagram(
x = input_list,
filename = file_name_svg, imagetype = "svg",
height = venn_width, width = venn_width, col = "transparent",
fill = c("cornflowerblue", "green", "yellow", "darkorchid1"),
label.col = c(
"orange", "white", "darkorchid4", "white", "white",
"white", "white", "white", "darkblue", "white", "white", "white", "white",
"darkgreen", "white"
),
fontfamily = "serif", fontface = "bold",
cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"), cat.cex = 1.5,
cat.pos = 0, cat.dist = 0.07, cat.fontfamily = "serif", alpha = 0.50, cex = 1.5,
rotation.degree = 270, margin = 0.2
)
if (isTRUE(identical(file_extension, ".pdf"))) {
file_name_pdf <- paste0(file_path, ".pdf")
rsvg::rsvg_pdf(svg = file_name_svg, file = file_name_pdf)
}
}
#' Plot Differentially Expressed Genes Heatmap
#'
#' Plot a expression heatmap of differentially expressed genes de_analysis
#'
#' @param de_res results from de_analysis function
#' @param tx \code{txi} object, object imported with
#' `import_tx()` or data frame wih gene abundance
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param row_names variable / coluimn name used for gene / row annotation.
#' Default = "gene" column
#'
#' @export
#'
# de_res <- de_res;tx=imported_transcripts;pvalue = 0.1;fdr = TRUE;
# de_res <- top_degs_res;tx=filtered_tx;pvalue = 0.2;fdr = TRUE;row_names = "hgnc_symbol";
plot_deg_heatmap <- function(de_res, tx, pvalue = 0.05, fdr = FALSE,
row_names = NULL) {
pvalue_cutoff <- pvalue
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
deg_names <- de_res %>%
dplyr::filter(!is.na(!!pvalue_var)) %>%
dplyr::filter(!!pvalue_var <= pvalue_cutoff) %>%
dplyr::pull(gene)
if (isTRUE(is.list(tx))) {
expr_df <- tx$abundance
} else {
stop("Invalid imported tx")
}
heatmap_df <- expr_df %>%
dplyr::as_tibble(rownames = "gene") %>%
dplyr::filter(gene %in% deg_names)
if (!is.null(row_names)) {
row_names_for_join <- de_res %>%
dplyr::select(gene, !!row_names)
heatmap_df <- heatmap_df %>%
dplyr::left_join(row_names_for_join, by = "gene") %>%
dplyr::select( -gene ) %>%
dplyr::rename( gene = !!row_names)
}
# heatmap_df %>%
# dplyr::filter(!is.na(gene))
heatmap_row_names <- heatmap_df %>%
dplyr::pull(gene)
heatmap_df <- heatmap_df %>%
dplyr::select(-gene) %>%
base::as.data.frame()
rownames(heatmap_df) <- heatmap_row_names
## Sorting dendograms
sort_hclust <- function(...) {
stats::as.hclust(dendsort::dendsort(stats::as.dendrogram(...)))
}
mat_cluster_cols <- stats::hclust(stats::dist(t(heatmap_df)))
mat_cluster_cols <- sort_hclust(mat_cluster_cols)
mat_cluster_rows <- sort_hclust(stats::hclust(stats::dist(heatmap_df)))
breaks_value <- base::seq(-3, 3, length.out = 101)
utils::assignInNamespace(
x = "draw_colnames",
value = "draw_colnames_45",
ns = asNamespace("pheatmap")
)
pheatmap::pheatmap(
mat = heatmap_df,
color = grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(
n = 7, name = "PRGn")))(100),
breaks = breaks_value,
show_rownames = TRUE,
border_color = NA,
treeheight_col = 30,
treeheight_row = 30,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
scale = "row"
)
}
#' Plot a Heatmap From gsea_analysis
#'
#' Plot a heatmap of enriched gene sets from gsea_analysis
#'
#' @param ... list of gsea_analisys results
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param names vector with names to be used at the x axis label
#'
#' @param occurrence Number of samples that the gene set
#' needs to be enriched
#'
#' @export
#'
# gsea_res_list <- list(gsea_go_bot_2014,gsea_go_neo_2014,gsea_go_rio_2013,gsea_go_rio_2015)
# pvalue = 0.2; fdr = TRUE; names = c("Botucatu 2014","Neopolis 2014","Rio 2013","Rio 2015")
# occurrence = NULL
plot_gsea_heatmap <- function(..., pvalue = 0.05, fdr = FALSE, names = NULL,
occurrence = NULL) {
gsea_res_list <- list(...)
pvalue_cutoff <- pvalue
sample_names <- names
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
if (is.null(occurrence)) {
occurrence <- length(gsea_res_list)
}
enriched_gene_sets <- gsea_res_list %>%
purrr::map(~{
.x %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff) %>%
dplyr::pull(1)
})
enriched_vector <- enriched_gene_sets %>%
unlist() %>%
table()
top_sets <- dplyr::data_frame(
gene_sets = names(enriched_vector),
occurrency = enriched_vector
) %>%
dplyr::arrange(-occurrency)
top_sets <- top_sets %>%
dplyr::filter(occurrency >= occurrence) %>%
dplyr::pull(1)
gene_set_var <- gsea_res_list[[1]] %>% colnames()
gene_set_var <- gene_set_var[1]
gene_set_var <- dplyr::sym(gene_set_var)
pvalue_df <- gsea_res_list %>%
purrr::map2_df(sample_names, ~{
.x %>%
dplyr::filter(!!gene_set_var %in% top_sets) %>%
dplyr::select(1, !!pvalue_var) %>%
dplyr::mutate(sample_name = .y)
})
mat_df <- pvalue_df %>%
tidyr::spread(sample_name, !!pvalue_var, -1)
mat_rownames <- mat_df %>%
dplyr::select(1) %>%
dplyr::pull()
mat <- mat_df %>%
dplyr::select(-1) %>%
base::as.matrix()
rownames(mat) <- mat_rownames
mat <- mat %>%
base::log10() %>%
base::abs()
sort_hclust <- function(...) {
stats::as.hclust(dendsort::dendsort(stats::as.dendrogram(...)))
}
mat_cluster_cols <- stats::hclust(stats::dist(t(mat)))
mat_cluster_cols <- sort_hclust(mat_cluster_cols)
mat_cluster_rows <- sort_hclust(stats::hclust(stats::dist(mat)))
utils::assignInNamespace(
x = "draw_colnames",
value = "draw_colnames_45",
ns = asNamespace("pheatmap")
)
# Because -log10(0.001) = 3 and
# if p-value is smaller than 0.01, it's already much significant
p_value_max <- 3
# p_value_max <- max(mat)
pheatmap::pheatmap(
mat = mat,
color = viridis::viridis(100, direction = 1),
show_rownames = TRUE,
border_color = NA,
treeheight_col = 30,
treeheight_row = 30,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
breaks = base::seq(0, p_value_max, (p_value_max / 100))
)
}
#' Plot Gene Expression
#'
#' Plot gene expression points
#'
#' @param gene gene or vector of genes
#'
#' @param tx \code{txi} object, object imported with
#' `import_tx()` or data frame wih gene abundance
#'
#' @param sample_table metadata data frame, containing description
#' of each sample and experimental design
#'
#' @param color_by variable from sample table to group samples in the plot
#'
#' @param filter samples to filter, dafault = NULL
#'
#' @export
#'
# tx=imported_expression;gene="AAEL000080";color_by="day"
# tx <- imported_transcripts;gene <- "AAEL006469";sample_table <- samples_metadata;color_by_var <- "condition";
plot_gene_expression <- function(gene, tx, sample_table = NULL,
color_by = NULL, filter = NULL) {
if (isTRUE(is.list(tx))) {
expr_df <- tx$abundance
}
metadata_df <- sample_table
if (is.null(sample_table)) {
metadata_df <- dplyr::data_frame()
} else {
sample_var <- metadata_df %>%
dplyr::select(1) %>%
names()
}
if (!is.null(color_by)) {
color_by_var <- color_by
}
gene_query <- gene
expr_df %>%
dplyr::as_tibble(rownames = "gene") %>%
dplyr::filter(gene %in% gene_query) %>%
tidyr::gather(sample, tpm, -gene) %>%
dplyr::left_join(metadata_df, by = c("sample" = sample_var)) %>%
ggplot2::ggplot() +
ggplot2::geom_point(
ggplot2::aes(sample, tpm, color = !!as.name(color_by_var)),
size = 3
) +
ggpubr::theme_pubr() +
ggplot2::theme(axis.text.x = ggplot2::element_text(
angle = 45, hjust = 1, vjust = 1
)) +
ggplot2::scale_color_viridis_d()
}
#' Plot GSEA Results and Leading Edge
#'
#' plot gsea results with leading edge genes for enriched gene sets
#'
#' @param gsea_res result from gsea_analysis
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @export
#'
# gsea_res <- gsea_kegg_res; pvalue <- 0.2; fdr = TRUE;
# gsea_res <- gsea_kegg_res; pvalue <- 0.05; fdr = FALSE;
plot_gsea_res <- function(gsea_res, pvalue = 0.05, fdr = FALSE) {
pvalue_cutoff <- pvalue
pvalue_var <- dplyr::quo(pvalue)
pvalue_label <- dplyr::quo(`P-value`)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
pvalue_label <- dplyr::quo(`Adjusted p-value`)
}
gsea_res_sig <- gsea_res %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff)
leading_edge_number <- c()
for (paths in gsea_res_sig$leading_edge) {
leading_edge_number <- c(leading_edge_number, length(paths))
}
gsea_res_df <- dplyr::data_frame(
gene_set = gsea_res_sig %>% dplyr::pull(1),
pvalue_temp = gsea_res_sig %>% dplyr::pull(!!pvalue_var),
NES = gsea_res_sig$NES,
`Number of LE genes` = leading_edge_number
) %>%
dplyr::arrange(NES)
if (isTRUE(fdr)) {
gsea_res_df <- gsea_res_df %>%
dplyr::rename(`Adjusted p-value` = pvalue_temp)
} else {
gsea_res_df <- gsea_res_df %>%
dplyr::rename(`P-value` = pvalue_temp)
}
# Plotting
plot <- gsea_res_df %>%
ggplot2::ggplot(ggplot2::aes(NES, gene_set)) +
ggplot2::geom_point(
ggplot2::aes(colour = !!pvalue_label, size = `Number of LE genes`)) +
ggplot2::scale_color_gradientn(
colours = viridis::viridis(4, direction = -1),
limits = c(0, pvalue_cutoff)
) +
ggplot2::geom_vline(xintercept = 0, size = 0.5, colour = "gray50") +
ggplot2::theme(
panel.background = ggplot2::element_rect(
fill = "gray95", colour = "gray95"
),
panel.grid.major = ggplot2::element_line(
size = 0.25, linetype = "solid", colour = "gray90"
),
panel.grid.minor = ggplot2::element_line(
size = 0.25, linetype = "solid", colour = "gray90"
),
axis.title.y = ggplot2::element_blank()
) +
ggplot2::expand_limits(x = c(-3, 3)) +
ggplot2::scale_x_continuous(breaks = c(-3, -2, -1, 0, 1, 2, 3)) +
ggplot2::scale_y_discrete(limits = rev(gsea_res_df$gene_set)) +
ggplot2::ylab( "Gene sets") +
ggpubr::theme_pubr()
plot
}
#' Plot Leading Edge Heatmap
#'
#' plot Ledading Edge analysis results enriched gene sets
#'
#' @param gsea_res result from gsea_analysis
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param num number of genes to plot, based on the leading edge occurence,
#' if num is negative, it is used the absolute num of genes with
#' lesser occurence, if num is NULL all leading edge elements
#' are used. Default = NULL.
#' @export
#'
# gsea_res <- gsea_kegg_res; pvalue <- 0.2; fdr = TRUE; num <- 30;
# gsea_res <- gsea_reactome_res; pvalue <- 0.05; fdr = FALSE; num <- -20;
# gsea_res <- gsea_go_bp_res_padj; pvalue = 0.05; fdr = FALSE; num <- 100;
plot_le_heatmap <- function(gsea_res, pvalue = 0.05, fdr = FALSE,
num = NULL) {
pvalue_cutoff <- pvalue
# use pvalue or padj
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
# filter gene sets by cutoff
gsea_res_sig <- gsea_res %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff)
#
le_df <- gsea_res_sig %>%
dplyr::select(pathway, leading_edge) %>%
tidyr::unnest() %>%
dplyr::distinct()
# arrange by occurence and pvalue
le_occurence_df <- le_df %>%
dplyr::group_by(leading_edge) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::arrange(-n)
# slice `num` rows
if (is.null(num)) {
le_genes <- unique(le_occurence_df$leading_edge)
} else if (isTRUE(num < 0)) {
le_genes <- le_occurence_df %>%
dplyr::slice((dplyr::n() - num):dplyr::n()) %>%
dplyr::pull(leading_edge)
} else if (isTRUE(num > 0)) {
le_genes <- le_occurence_df %>%
dplyr::slice(1:num) %>%
dplyr::pull(leading_edge)
} else {
stop("`num` should be an integer different from 0 or NULL")
}
le_wide_df <- le_df %>%
dplyr::filter(leading_edge %in% le_genes) %>%
tidyr::spread(leading_edge, -pathway) %>%
dplyr::distinct()
le_mat_row_names <- le_wide_df$pathway
le_wide_df <- le_wide_df %>%
dplyr::select(-pathway) %>%
as.data.frame()
for (column in colnames(le_wide_df)) {
le_wide_df[[column]] <- dplyr::if_else(!is.na(le_wide_df[[column]]), 1, 0)
}
rownames(le_wide_df) <- le_mat_row_names
utils::assignInNamespace(
x = "draw_colnames",
value = "draw_colnames_45",
ns = asNamespace("pheatmap")
)
pheatmap::pheatmap(
mat = le_wide_df,
color = viridis::viridis(2, direction = 1),
show_rownames = TRUE,
# border_color = NA,
treeheight_col = 0,
treeheight_row = 0,
drop_levels = TRUE,
cluster_cols = TRUE,
cluster_rows = TRUE,
legend = TRUE,
legend_breaks = c(0.25, 0.75),
legend_labels = c("Not present", "Leading Edge")
)
}
#' Multiple Plot
#'
#' ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
#' - cols: Number of columns in layout
#' - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#' If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
#' then plot 1 will go in the upper left, 2 will go in the upper right, and
#' 3 will go all the way across the bottom.
#' adapted from ggplot2 cookbook.
#'
#' @param ... ggplot objects
#'
#' @param plotlist list of ggplot objects
#'
#' @param cols number of columns in layout
#'
#' @param layout A matrix specifying the layout. If present, 'cols' is ignored
#'
#' @export
#'
multiplot <- function(..., plotlist = NULL, cols = 1, layout = NULL) {
# library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(base::list(...), plotlist)
num_plots <- base::length(plots)
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- base::matrix(seq(1, cols * base::ceiling(num_plots / cols)),
ncol = cols, nrow = base::ceiling(num_plots / cols)
)
}
if (num_plots == 1) {
print(plots[[1]])
} else {
# Set up the page
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), base::ncol(layout))))
# Make each plot, in the correct location
for (i in 1:num_plots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- base::as.data.frame(base::which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
))
}
}
}
###############################################
## #
## Aux Functions #
## #
###############################################
#' Pipe, as in Pipeline
#'
#' Exported from dplyr, use \code{\%>\%} to turn
#' function composition into a series of imperative statements.
#'
#' @importFrom dplyr %>%
#' @name %>%
#' @rdname pipe
#' @export
#' @param lhs,rhs An object and a function to apply to it
#' @examples
#' # Instead of
#' abs(nrow(head(mtcars)))
#' # you can write
#' mtcars %>% head() %>% nrow() %>% abs()
NULL
#' plot aux function
#'
#' Auxiliar function for rotating plot_heatmap x axis
#'
#' @param coln number of columns
#' @param gaps gaps
#' @param ... additional parameters
#' @export
draw_colnames_45 <- function(coln, gaps, ...) {
# Correct labels orientation, thanks to https://slowkow.com/notes/heatmap-tutorial/
find_coordinates_temp <- utils::getFromNamespace("find_coordinates", "pheatmap")
coord <- find_coordinates_temp(length(coln), gaps)
x <- coord$coord - 0.5 * coord$size
res <- grid::textGrob(
coln,
x = x, y = grid::unit(1, "npc") - grid::unit(3, "bigpts"),
vjust = 0.75, hjust = 1, rot = 45, gp = grid::gpar(...)
)
return(res)
}
#' Calculate Variance by Row
#'
#' Calculate variance by row in a matrix or data frame
#'
#' @param x matrix or data frame
#' @param ... additional paramaters addressed to rowSums or rowMeans
#' @export
calc_var_by_row <- function(x, ...) {
base::rowSums((x - base::rowMeans(x, ...))^2, ...) / (base::dim(x)[2] - 1)
}
#' plot aux function 2
#'
#' Prepare data for plotting PCA or Expression heatmap
#' @param tx expression data or result from import_tx
#' @export
prepare_tx_mat <- function(tx) {
if (is.list(tx)) {
if (!dplyr::is.tbl(tx) & !is.data.frame(tx)) {
expression_matrix <- tx$abundance
} else {
expression_matrix <- tx
}
} else {
expression_matrix <- tx
}
if (!is.matrix(expression_matrix)) {
if (!isTRUE(all(!is.na(suppressWarnings(as.numeric(as.matrix(expression_matrix))))))) {
temp_mat <- base::as.matrix(expression_matrix[, 2:length(expression_matrix)])
base::rownames(temp_mat) <- expression_matrix %>%
dplyr::pull(1)
expression_matrix <- temp_mat
}
}
expression_matrix
}
#' Differential Expression results
#'
#' Filter upregulated and downregulated genes from
#' Differential expression analysis in a table
#'
#' @param ... de_analysis or gsea_analysis results data frames
#'
#' @param pvalue p-value cut-off for enrichment or differential consideration,
#' default = 0.05
#'
#' @param fdr logical, default = FALSE should False discovery rate adjusted
#' pvalue be used instead of pvalue
#'
#' @param names vector with names to be used for each group
#'
#' @export
retrieve_deg_table <- function(..., pvalue = 0.05, fdr = FALSE, names = NULL) {
pvalue_var <- dplyr::quo(pvalue)
if (isTRUE(fdr)) {
pvalue_var <- dplyr::quo(padj)
}
pvalue_cutoff <- pvalue
temp_var <- list(...)
if (is.list(temp_var)) {
deg_table_population <- temp_var
}
else {
deg_table_population <- list(temp_var)
}
if (!is.null(base::names(deg_table_population))) {
names <- base::names(deg_table_population)
}
deg_table_population <- deg_table_population %>%
purrr::map_df(~{
temp_res <- .x
up_deg <- temp_res %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff) %>%
dplyr::filter(`log2FoldChange` > 0) %>%
nrow()
down_deg <- temp_res %>%
dplyr::filter(!!pvalue_var < pvalue_cutoff) %>%
dplyr::filter(`log2FoldChange` < 0) %>%
nrow()
dplyr::data_frame(
up = up_deg,
down = down_deg,
total = up_deg + down_deg
)
})
deg_table_population %>%
dplyr::mutate(sample = names) %>%
dplyr::select(sample, up, down, total)
}
#' Leading Edge Genes By Gene Set
#'
#' Retrieve table with leading edge genes by gene set, for multiple
#' experiments the column occurrence will have the number of hits
#'
#' @param ... gsea_analysis results data frames
#'
#' @param names vector with names to be used for each group
#'
#' @export
# gsea_res_list <- list(gsea_complex_bot_2014,gsea_complex_neo_2014,gsea_complex_rio_2013,gsea_complex_rio_2015)
# names <- names <- c("Botucatu 2014", "Neopolis 2014", "Rio 2013", "Rio 2015")
retrieve_le_table <- function(..., names = NULL) {
gsea_res_list <- list(...)
if (is.null(names)) {
names <- ""
}
le_res <- gsea_res_list %>%
purrr::map2_df(names, ~{
gene_set_var <- .x %>%
dplyr::select(1) %>%
names()
sample_temp <- .y
.x %>%
tidyr::unnest() %>%
dplyr::select(!!gene_set_var, `leading_edge`) %>%
dplyr::mutate(sample = sample_temp)
})
le_res %>%
dplyr::group_by(!!as.name(gene_set_var), `leading_edge`) %>%
dplyr::summarise(occurrence = n()) %>%
dplyr::arrange(!!as.name(gene_set_var), -occurrence) %>%
dplyr::ungroup()
}
# tx <- imported_transcripts
#tidy_tx <- function(tx){
# tx[[3]]
#}
#tidy_tx(imported_transcripts)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.