#' Parse inputs (from aracne-ap_viper Nextflow)
#' @param NETWORK ARACNe_AP network file
#' @param EXPRMAT expression matrix in tab-separated format:
#' header line in format:
#' geneID sample_1 ... sampleID_n
#' all other lines in format
#' geneID_1 value_1 ... value_n etc.
#' @param METADATA metadata to discriminate between cohorts for msViper analysis;
#' tab-separated format; header line format:
#' sample group
#' all other lines:
#' sampleID_1 group_0 etc.
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with
#' '_gene_ensembl'
#' @param TAG string identifier of the run
#' @return none, saves two RData files to current dir contianing:
#' ensembl to external gene mapping (en2ext);
#' combined.eset, regulonaracne required to run msViper
#' @export
parse_aracne <- function(NETWORK, EXPRMAT, METADATA, TAG, genome_prefix = "hsapiens"){
##process annotation data
tx2gene <- RNAseqon::get_tx2gene(genome_prefix)
##ARACNe regulon
print("ARACNe-AP to regulon...")
regulonaracne <- suppressWarnings(viper::aracne2regulon(afile = NETWORK,
eset = EXPRMAT))
## create eset
print("Parsing expression matrix...")
exprd <- readr::read_tsv(EXPRMAT)
##check geneID as external_gene_name, then convert to ensembl_gene_id
exprdg <- dplyr::left_join(tx2gene, exprd) %>%
dplyr::select(external_gene_name, ensembl_gene_id, where(is.numeric))
exprdh <- RNAseqon::highest_exp_wides(wide_object = exprdg,
name_col = "external_gene_name",
id_col = "ensembl_gene_id",
value_cols = 3:dim(exprdg)[2])
print("Processing metadata...")
pheno <- readr::read_tsv(METADATA)
sumatch <- sum(match(colnames(pheno), c("sampleID", "group")))
if(sumatch != 3 | is.na(sumatch)){
print("Converting metadata colnames, ensure they are in sample, group order or msViper may fail!")
colnames(pheno) <- c("sampleID", "group")
}
dot_dash <- ifelse(length(strsplit(colnames(exprd)[3], "-")[[1]]),"-", ".")
phenodf <- pheno %>%
dplyr::mutate(sampleID = gsub("-", dot_dash, sampleID)) %>%
dplyr::filter(sampleID %in% colnames(exprd)) %>%
dplyr::rename(description = group) %>%
as.data.frame()
exprds <- exprdh %>%
dplyr::select(-external_gene_name) %>%
as.data.frame() %>%
tibble::column_to_rownames(., var = "ensembl_gene_id")
phenos <- phenodf[phenodf$sampleID %in% colnames(exprds),]
print("Creating ExpressionSet...")
pData <- new("AnnotatedDataFrame",
data=data.frame(row.names = colnames(exprds),
description = factor(phenos$description)))
combined.eset <- Biobase::ExpressionSet(assayData = as.matrix(exprds),
phenoData = pData)
print("Saving...")
save(combined.eset, tx2gene, phenos, exprds, exprdh, regulonaracne, file = paste0(TAG, ".parse_inputs.RData"))
}
#' Run msViper
#' @param TAG string identifier of the run
#' @param RDATA RData file generated by parse_input()
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with '_gene_ensembl'
#' @param msigdb_species one of msigdbr::msigdbr_species()
#' @return none, saves RData files of output
#' @export
run_msviper <- function(TAG, RDATA, genome_prefix, msigdb_species){
##load parse_input RData
load(RDATA)
##generate signature for pairwise description levels
LEVELS <- levels(factor(phenos$description))
lapply(seq_along(LEVELS), function(f){
level1 <- LEVELS[f]
print(paste0("Working on: ", level1))
##require 10+ samples in Group
if(table(phenos$description %in% level1)[2] > 7){
pairname <- paste0(gsub(" ","-",level1), "_vs_rest")
signature <- viper::rowTtest(combined.eset, "description", level1)
rownamesp <- rownames(signature$p.value)
signature$p.value <- as.matrix(p.adjust(signature$p.value, method="BH"))
rownames(signature$p.value) <- rownamesp
signature <- (qnorm(signature$p.value/2, lower.tail = FALSE) * sign(signature$statistic))[, 1]
siginf <- signature[!signature=="-Inf"]
nullmodel <- viper::ttestNull(combined.eset, "description", level1, LEVELS[! LEVELS %in% level1], per = 5000, repos = TRUE, verbose = TRUE)
regulonaracneinf <- regulonaracne[names(regulonaracne) %in% names(siginf)]
## msViper, leading edge analyses + bootstrapped
mra <- viper::msviper(ges = siginf,
regulon = regulonaracne,
nullmodel = nullmodel,
verbose = FALSE)
mra <- viper::ledge(mra)
mra <- viper::shadow(mobj = mra,
regulators = 0.01,
shadow = 0.01,
per = 5000,
verbose = FALSE)
mrac <- viper::msviperCombinatorial(mobj = mra,
regulators = 0.01,
verbose = FALSE)
##if statement to deny error from synergy analysis which otherwise exits from the script
if(length(mrac$regulon) > length(mra$regulon)){
mra <- viper::msviperSynergy(mobj = mrac, verbose = FALSE)
##BH adjust p-values
mra$es$q.value <- p.adjust(mra$es$p.value, method = "BH")
}
if(length(mrac$regulon) == length(mra$regulon)){
##BH adjust p-values
mra$es$q.value <- p.adjust(mra$es$p.value, method = "BH")
}
## outputs
##omit NaN as throws error
mra$signature <- na.omit(mra$signature)
pdf(paste0(TAG, ".", pairname, ".msViper.syn.results.pdf"))
plot(mra, pval = mra$es$q.value, cex = .7)
dev.off()
mra_tb <- data.frame(NES = mra$es$nes,
Size = mra$es$size,
p.value = mra$es$p.value,
q.value = mra$es$q.value)
readr::write_tsv(tibble::as_tibble(mra_tb, rownames = "ensembl_gene_id"), file = paste0(TAG, ".", pairname, ".msViper.results.tsv"))
save(mra, mra_tb, file = paste0(TAG, ".", pairname, ".msViper.RData"))
##fgsea ssgsea
fgsea_ssgsea_msviper(mra = mra,
mra_stat = "p.value",
genome_prefix,
msigdb_species,
msigdb_cat = "H",
tag = TAG,
exprdh = exprdh,
tx2gene = tx2gene,
gene_col = "external_gene_name",
output_dir = "./",
sig_val = 0.1,
per_regulon = TRUE,
phenos = phenos)
}
})
}
# ##filter out significant results (p-value)
# #'
# #' @param RDATA object
# #' @param ENS2EXT expression matrix in tab-separated format:
# #' header line in format:
# #' geneID sample_1 ... sampleID_n
# #' all other lines in format
# #' geneID_1 value_1 ... value_n etc.
# #' @param PVAL metadata to discriminate between cohorts for msViper analysis;
# #' tab-separated format; header line format:
# #' sample group
# #' all other lines:
# #' sampleID_1 group_0 etc.
# #' @return none, saves two RData files to current dir contianing:
# #' ensembl to external gene mapping (en2ext);
# #' combined.eset, regulonaracne required to run msViper
# #' @export
#
# filter_sig_res <- function(RDATA, ENS2EXT = "ens2ext.RData", PVAL = NULL){
#
# ##load annotation
# load(ENS2EXT)
#
# if(is.null(PVAL)){
# PVAL <- 0.05
# }
#
# for(x in 1:length(RDATA)){
# print(paste0("Loading: ", RDATA[x]))
# load(RDATA[x])
# }
#
# les <- ls(pattern="vs")
# got <- get(les)
# names(got) <- les
# lapply(seq_along(got), function(ff){
# f <- got[[ff]]
# signames <- names(f$es$p.value[f$es$p.value<PVAL])
# if(length(signames>0)){
# sigout <- ens2ext %>% dplyr::filter(ensembl_gene_id %in% signames | external_gene_name %in% signames)
# sigout$size <- f$es$size[names(f$es$size) %in% signames]
# sigout$pvalue <- f$es$p.value[names(f$es$p.value) %in% signames]
# sigout$qvalue <- f$es$q.value[names(f$es$q.value) %in% signames]
# outname <- gsub("RData", "sig.tsv", grep("vs",RDATA,value=T)[ff])
# write_tsv(sigout, path=outname)
# }
# if(length(signames==0)){
# print(paste0("No significant genes found at p < ", PVAL))
# }
# })
# }
#' The situation arises where a single gene name maps to multiple gene identifiers
#' e.g. ENSG00000271503, ENSG00000274233 both map to CCL5
#' this function takes wide format input with at least 3 rows:
#' 1: gene name
#' 2: gene identifier
#' 3: expression value(s) [can be index of columns]
#' function returns a wide format object with highest expression name+id
#' also returns full mapping info and mean expression value per name, id pairs
#'
#' @param wide_object a wide tibble or df
#' @param name_col naming column (gene name)
#' @param id_col identification column (gene ID, i.e. ENSID)
#' @param value_col expression value column (can be index)
#' @importFrom magrittr '%>%'
#' @return wide format with higehst exp. when multiple genes same named
#' @export
highest_exp_wides <- function(wide_object, name_col, id_col, value_cols){
##create mean_value of all rows, then count name_col
##filter out all with name_col single mapping
mean_map <- wide_object %>% dplyr::select(name_col, id_col, value_cols) %>%
dplyr::mutate(mean_value = rowMeans(.[value_cols])) %>%
dplyr::group_by(dplyr::across(name_col)) %>%
na.omit(mean_value) %>%
dplyr::add_count() %>%
dplyr::distinct()
mm_1 <- mean_map %>% dplyr::filter(n == 1) %>%
dplyr::filter(mean_value == max(mean_value)) %>%
dplyr::ungroup() %>%
dplyr::distinct(dplyr::across(-id_col), .keep_all = TRUE) %>%
dplyr::select(-n, -mean_value)
##remove the higher ENSG00000 value, these are on patches in GRCh38
str_no <- function(x){
lapply(x, function(f){strsplit(f, "ENSG000")[[1]][2]})
}
mm_gt1 <- mean_map %>% dplyr::filter(n > 1) %>%
dplyr::filter(mean_value == max(mean_value)) %>%
dplyr::mutate(ensembl_gene_id_no = as.numeric(str_no(ensembl_gene_id))) %>%
dplyr::filter(ensembl_gene_id_no == min(ensembl_gene_id_no)) %>%
dplyr::ungroup() %>%
dplyr::distinct(dplyr::across(-id_col), .keep_all = TRUE) %>%
dplyr::select(-n, -mean_value, -ensembl_gene_id_no)
##return the two sets, bound
mean_map <- dplyr::bind_rows(mm_gt1, mm_1)
return(mean_map)
}
#' Make required inputs for ARACNe-AP which is now supported through RNAseqon
#' @param tpm_tb tpms
#' @param metadata metadatas
#' @param meta_group metadatas group
#' @param tag string
#' @importFrom magrittr '%>%'
#' @return wide format with higehst exp. when multiple genes same named
#' @export
make_aracne_inputs <- function(tpm_tb, metadata, meta_group, tag){
##from *.de_ready.RData
exprmat <- tpm_tb %>% dplyr::select(-external_gene_name)
##
# regulators <- de_res %>% dplyr::select(ensembl_gene_id) %>%
# as.data.frame()
##
metaData <- metadata %>% dplyr::select(1, !!meta_group)
readr::write_tsv(exprmat, file=paste0(tag, ".exprmat.tpm.tsv"))
# readr::write_tsv(regulators, file=paste0(tag, ".regulators.txt"))
readr::write_tsv(metaData, file=paste0(tag, ".metaData.txt"))
}
#' Run fgsea on each regulon set individually
#'
#' @param mra msviper class object with es, ledge
#' @param mra_stat statistic used to rank data, comes from msViper
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with '_gene_ensembl'
#' @param msigdb_species one of msigdbr::msigdbr_species(), default:"Homo sapiens"
#' @param msigdb_cat one of 'c("H", paste0("C", c(1:7)))', see: gsea-msigdb.org/gsea/msigdb/collections.jsp
#' @param tag string used to prefix output
#' @param exprdh expression data
#' @param tx2gene annotation data
#' @param gene_col the name of the column in tibble with gene names found in pathways, set to "rownames" if they are the rownames (default)
#' @param output_dir path to where output goes
#' @param sig_val significance used in analysis
#' @param per_regulon flag to indicate if fgsea is at collective regulon level,
#' or if each regulon and the genes it is co-expressed with are used
#' @param phenos metadata tibble
#' @importFrom magrittr '%>%'
#' @return sure
#' @export
fgsea_ssgsea_msviper <- function(mra, mra_stat = "p.value", genome_prefix, msigdb_species = "Homo sapiens", msigdb_cat = "H", tag, exprdh, tx2gene, gene_col = "external_gene_name", output_dir, sig_val = 0.1, per_regulon = NULL, phenos) {
print("Running: fgsea_ssgsea_msviper()")
##output dir
lapply(c("fgsea", "ssgsea"), function(ff){
lapply(c("data", "plots"), function(f){
dir.create(paste0(output_dir, "/", ff, "/", f),
recursive = TRUE,
showWarnings = FALSE)
})
})
##MsigDB pathways genelists
msigdb_pathlist <- msigdb_pathways_to_list(msigdb_species, msigdb_cat)
##create ranks
if(is.null(per_regulon)){
rank_vec <- tibble::as_tibble(data.frame(stat = mra$es[["nes"]]), rownames = "ensembl_gene_id") %>%
dplyr::left_join(., tx2gene[,c(2,3)]) %>%
dplyr::select(external_gene_name, stat) %>%
dplyr::distinct() %>%
na.omit() %>%
tibble::deframe()
##run, order fgsea
fgsea_res <- fgsea::fgsea(pathways = msigdb_pathlist,
stats = rank_vec,
nperm = 1000000)
##make tibble and add FDR, filters
fgsea_res_tb <- dplyr::mutate(.data = tibble::as_tibble(fgsea_res))
fgsea_res_sig_tb <- dplyr::filter(.data = fgsea_res_tb, padj < !!sig_val)
fgsea_res_sig_tb <- dplyr::arrange(.data = fgsea_res_sig_tb, desc(NES))
gg_fgsea <- fgsea_plot(fgsea_res_tb = fgsea_res_sig_tb, msigdb_cat = msigdb_cat)
ggplot2::ggsave(gg_fgsea, file = paste0(output_dir, "/fgsea/plots/", tag , ".fgsea_sig_viper.ggplot2.pdf"))
save(fgsea_res_tb, file = paste0(output_dir, "/fgsea/data/", tag , ".fgsea_sig_viper.RData"))
} else {
##get those sweet rotation ssGSEA plots
sigulons <- names(mra$regulon)[mra$es$p.value < sig_val]
sigulons <- sigulons[sigulons %in% tx2gene$ensembl_gene_id]
sigunames <- dplyr::filter(.data = dplyr::distinct(tx2gene[,c(2,3)]), ensembl_gene_id %in% sigulons)
sigunames <- c(dplyr::select(.data = sigunames, external_gene_name))
fgsea_res_sigulon_list <- lapply(sigulons, function(f){
print(f)
tfmode <- mra$regulon[[f]]$tfmode
rank_vec <- tibble::as_tibble(tfmode, rownames = "ensembl_gene_id") %>%
dplyr::left_join(., dplyr::distinct(tx2gene[,c(2,3)])) %>%
dplyr::select(external_gene_name, stat = "value") %>%
dplyr::distinct() %>%
na.omit() %>%
tibble::deframe()
##run, order fgsea
fgsea_res <- fgsea::fgsea(pathways = msigdb_pathlist,
stats = rank_vec,
nperm = 1000000)
##make tibble and add FDR, filters
fgsea_res_tb <- dplyr::mutate(.data = tibble::as_tibble(fgsea_res))
fgsea_res_sig_tb <- dplyr::filter(.data = fgsea_res_tb, padj < !!sig_val)
if(dim(fgsea_res_sig_tb)[1] > 0){
gg_fgsea <- fgsea_plot(fgsea_res_tb = fgsea_res_sig_tb, msigdb_cat = msigdb_cat)
ggplot2::ggsave(gg_fgsea, file = paste0(output_dir, "/fgsea/plots/", tag , ".fgsea_sig_viper.ggplot2.pdf"))
save(fgsea_res_tb, gg_fgsea, file = paste0(output_dir, "/fgsea/data/", tag , ".fgsea_sig_viper.RData"))
}
return(list(rank_names = names(rank_vec), fgsea_res = fgsea_res))
})
names(fgsea_res_sigulon_list) <- unlist(sigunames)
save(fgsea_res_sigulon_list, file = paste0(output_dir, "/fgsea/data/", tag , ".per_regulon_viper.RData"))
##do all sigulons at once, i.e. all genes in sigulon genesets
sigulon_s <- unlist(mra$regulon[mra$es$p.value < sig_val])
sigulon_tfmode <- sigulon_s[grep("tfmode", names(sigulon_s))]
sigulon_set <- sigulon_tfmode
names(sigulon_set) <- unlist(lapply(names(sigulon_set), function(f){
strsplit(f, "\\.")[[1]][3]
}))
rank_vec_sigulon_tb <- tibble::enframe(sigulon_set) %>%
dplyr::rename(ensembl_gene_id = "name") %>%
dplyr::left_join(., dplyr::distinct(tx2gene[,c(2,3)])) %>%
dplyr::select(external_gene_name, ensembl_gene_id, value) %>%
dplyr::distinct() %>%
na.omit() %>%
dplyr::arrange(external_gene_name) %>% dplyr::group_by(external_gene_name, ensembl_gene_id) %>% dplyr::summarize(stat = mean(value, na.rm = TRUE)) %>%
dplyr::ungroup()
rank_vec_sigulon_set <- tibble::deframe(rank_vec_sigulon_tb[,c("external_gene_name", "stat")])
##run, order fgsea
fgsea_res_sigulon_set <- fgsea::fgsea(pathways = msigdb_pathlist,
stats = rank_vec_sigulon_set,
nperm = 1000000)
fgsea_sig_sigulon_set <- fgsea_res_sigulon_set %>%
dplyr::filter(pval < sig_val)
##output results per gene
pathways_sig_res_tb <- msigdb_pathlist[names(msigdb_pathlist) %in% fgsea_sig_sigulon_set$pathway] %>%
tibble::enframe("pathway", dplyr::all_of("external_gene_name")) %>%
tidyr::unnest(cols = "external_gene_name") %>%
dplyr::inner_join(rank_vec_sigulon_tb, by = "external_gene_name") %>%
dplyr::select(-stat) %>%
dplyr::distinct()
##which regulons control which genes in those pathways?
reg_strsp <- function(x, y){
unlist(lapply(x, function(f){
strsplit(unlist(f), "\\.")[[1]][[y]]
}))}
regulon_ext_pway_tb <- tibble::enframe(sigulon_tfmode) %>%
dplyr::mutate(regulon = reg_strsp(name, 1),
ensembl_gene_id = reg_strsp(name, 3)) %>%
dplyr::select(regulon, ensembl_gene_id) %>%
dplyr::left_join(pathways_sig_res_tb, .) %>%
dplyr::select(regulon, pathway, external_gene_name, ensembl_gene_id) %>%
dplyr::arrange(regulon)
##do ssgsea
add_small <- function(f){f+0.0000001}
log2_tpm_mat <- dplyr::mutate(.data = exprdh, dplyr::across(where(is.numeric), add_small)) %>%
dplyr::mutate(dplyr::across(where(is.numeric), log2)) %>%
dplyr::select(-ensembl_gene_id) %>%
dplyr::distinct() %>%
tibble::column_to_rownames("external_gene_name") %>%
as.matrix()
metadata <- phenos %>% dplyr::rename(sample = "sampleID", group = "description")
pways <- lapply(fgsea_res_sigulon_list, function(f){
return(f$rank_names)
})
names(pways) <- sigunames$external_gene_name
if(length(fgsea_res_sigulon_list) > 1){
ssgsea_pca_list <- RNAseqon::ssgsea_pca(pways = pways,
log2tpm_mat = log2_tpm_mat,
msigdb_cat = msigdb_cat,
output_dir = output_dir,
contrast = paste0(tag, " msViper Regulon Genesets"),
metadata = metadata)
names(ssgsea_pca_list) <- sigunames
save(ssgsea_pca_list, pways, log2_tpm_mat, metadata, file = paste0(output_dir, "/ssgsea/data/", tag , ".ssgsea_pca_viper.RData"))
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.