R/aracne-ap_viper.R

Defines functions fgsea_ssgsea_msviper make_aracne_inputs highest_exp_wides run_msviper parse_aracne

Documented in fgsea_ssgsea_msviper highest_exp_wides make_aracne_inputs parse_aracne run_msviper

#' Parse inputs (from aracne-ap_viper Nextflow)
#' @param NETWORK ARACNe_AP network file
#' @param EXPRMAT expression matrix in tab-separated format:
#'                header line in format:
#'                geneID sample_1 ... sampleID_n
#'                all other lines in format
#'                geneID_1 value_1 ... value_n etc.
#' @param METADATA metadata to discriminate between cohorts for msViper analysis;
#'                 tab-separated format; header line format:
#'                 sample group
#'                 all other lines:
#'                 sampleID_1 group_0 etc.
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with
#'                      '_gene_ensembl'
#' @param TAG string identifier of the run
#' @return none, saves two RData files to current dir contianing:
#'         ensembl to external gene mapping (en2ext);
#'         combined.eset, regulonaracne required to run msViper
#' @export

parse_aracne <- function(NETWORK, EXPRMAT, METADATA, TAG, genome_prefix = "hsapiens"){

  ##process annotation data
  tx2gene <- RNAseqon::get_tx2gene(genome_prefix)

  ##ARACNe regulon
  print("ARACNe-AP to regulon...")
  regulonaracne <- suppressWarnings(viper::aracne2regulon(afile = NETWORK,
                                                          eset = EXPRMAT))

  ## create eset
  print("Parsing expression matrix...")
  exprd <- readr::read_tsv(EXPRMAT)

  ##check geneID as external_gene_name, then convert to ensembl_gene_id
  exprdg <- dplyr::left_join(tx2gene, exprd) %>%
            dplyr::select(external_gene_name, ensembl_gene_id, where(is.numeric))

  exprdh <- RNAseqon::highest_exp_wides(wide_object = exprdg,
                                        name_col = "external_gene_name",
                                        id_col = "ensembl_gene_id",
                                        value_cols = 3:dim(exprdg)[2])

  print("Processing metadata...")
  pheno <- readr::read_tsv(METADATA)
  sumatch <- sum(match(colnames(pheno), c("sampleID", "group")))

  if(sumatch != 3 | is.na(sumatch)){
    print("Converting metadata colnames, ensure they are in sample, group order or msViper may fail!")
    colnames(pheno) <- c("sampleID", "group")
  }

  dot_dash <- ifelse(length(strsplit(colnames(exprd)[3], "-")[[1]]),"-", ".")
  phenodf <- pheno %>%
             dplyr::mutate(sampleID = gsub("-", dot_dash, sampleID)) %>%
             dplyr::filter(sampleID %in% colnames(exprd)) %>%
             dplyr::rename(description = group) %>%
             as.data.frame()
  exprds <- exprdh %>%
            dplyr::select(-external_gene_name) %>%
            as.data.frame() %>%
            tibble::column_to_rownames(., var = "ensembl_gene_id")
  phenos <- phenodf[phenodf$sampleID %in% colnames(exprds),]
  print("Creating ExpressionSet...")
  pData <- new("AnnotatedDataFrame",
                data=data.frame(row.names = colnames(exprds),
                                description = factor(phenos$description)))
  combined.eset <- Biobase::ExpressionSet(assayData = as.matrix(exprds),
                                          phenoData = pData)
  print("Saving...")
  save(combined.eset, tx2gene, phenos, exprds, exprdh, regulonaracne, file = paste0(TAG, ".parse_inputs.RData"))
}

#' Run msViper
#' @param TAG string identifier of the run
#' @param RDATA RData file generated by parse_input()
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with '_gene_ensembl'
#' @param msigdb_species one of msigdbr::msigdbr_species()
#' @return none, saves RData files of output
#' @export

run_msviper <- function(TAG, RDATA, genome_prefix, msigdb_species){

  ##load parse_input RData
  load(RDATA)

  ##generate signature for pairwise description levels
  LEVELS <- levels(factor(phenos$description))
  lapply(seq_along(LEVELS), function(f){

    level1 <- LEVELS[f]
    print(paste0("Working on: ", level1))

    ##require 10+ samples in Group
    if(table(phenos$description %in% level1)[2] > 7){
      pairname <- paste0(gsub(" ","-",level1), "_vs_rest")
      signature <- viper::rowTtest(combined.eset, "description", level1)
      rownamesp <- rownames(signature$p.value)
      signature$p.value <- as.matrix(p.adjust(signature$p.value, method="BH"))
      rownames(signature$p.value) <- rownamesp
      signature <- (qnorm(signature$p.value/2, lower.tail = FALSE) * sign(signature$statistic))[, 1]
      siginf <- signature[!signature=="-Inf"]
      nullmodel <- viper::ttestNull(combined.eset, "description", level1, LEVELS[! LEVELS %in% level1], per = 5000, repos = TRUE, verbose = TRUE)
      regulonaracneinf <- regulonaracne[names(regulonaracne) %in% names(siginf)]
      ## msViper, leading edge analyses + bootstrapped
      mra <- viper::msviper(ges = siginf,
                            regulon = regulonaracne,
                            nullmodel = nullmodel,
                            verbose = FALSE)
      mra <- viper::ledge(mra)
      mra <- viper::shadow(mobj = mra,
                           regulators = 0.01,
                           shadow = 0.01,
                           per = 5000,
                           verbose = FALSE)
      mrac <- viper::msviperCombinatorial(mobj = mra,
                                          regulators = 0.01,
                                          verbose = FALSE)

      ##if statement to deny error from synergy analysis which otherwise exits from the script
      if(length(mrac$regulon) > length(mra$regulon)){
        mra <- viper::msviperSynergy(mobj = mrac, verbose = FALSE)
        ##BH adjust p-values
        mra$es$q.value <- p.adjust(mra$es$p.value, method = "BH")
      }
      if(length(mrac$regulon) == length(mra$regulon)){
        ##BH adjust p-values
        mra$es$q.value <- p.adjust(mra$es$p.value, method = "BH")
      }

      ## outputs
      ##omit NaN as throws error
      mra$signature <- na.omit(mra$signature)
      pdf(paste0(TAG, ".", pairname, ".msViper.syn.results.pdf"))
        plot(mra, pval = mra$es$q.value, cex = .7)
      dev.off()

      mra_tb <- data.frame(NES = mra$es$nes,
                           Size = mra$es$size,
                           p.value = mra$es$p.value,
                           q.value = mra$es$q.value)

      readr::write_tsv(tibble::as_tibble(mra_tb, rownames = "ensembl_gene_id"), file = paste0(TAG, ".", pairname, ".msViper.results.tsv"))

      save(mra, mra_tb, file = paste0(TAG, ".", pairname, ".msViper.RData"))

      ##fgsea ssgsea
      fgsea_ssgsea_msviper(mra = mra,
                           mra_stat = "p.value",
                           genome_prefix,
                           msigdb_species,
                           msigdb_cat = "H",
                           tag = TAG,
                           exprdh = exprdh,
                           tx2gene = tx2gene,
                           gene_col = "external_gene_name",
                           output_dir = "./",
                           sig_val = 0.1,
                           per_regulon = TRUE,
                           phenos = phenos)

    }
  })
}

# ##filter out significant results (p-value)
# #'
# #' @param RDATA object
# #' @param ENS2EXT expression matrix in tab-separated format:
# #'                header line in format:
# #'                geneID sample_1 ... sampleID_n
# #'                all other lines in format
# #'                geneID_1 value_1 ... value_n etc.
# #' @param PVAL metadata to discriminate between cohorts for msViper analysis;
# #'                 tab-separated format; header line format:
# #'                 sample group
# #'                 all other lines:
# #'                 sampleID_1 group_0 etc.
# #' @return none, saves two RData files to current dir contianing:
# #'         ensembl to external gene mapping (en2ext);
# #'         combined.eset, regulonaracne required to run msViper
# #' @export
#
# filter_sig_res <- function(RDATA, ENS2EXT = "ens2ext.RData", PVAL = NULL){
#
#   ##load annotation
#   load(ENS2EXT)
#
#   if(is.null(PVAL)){
#     PVAL <- 0.05
#   }
#
#   for(x in 1:length(RDATA)){
#     print(paste0("Loading: ", RDATA[x]))
#     load(RDATA[x])
#   }
#
#   les <- ls(pattern="vs")
#   got <- get(les)
#   names(got) <- les
#   lapply(seq_along(got), function(ff){
#   f <- got[[ff]]
#   signames <- names(f$es$p.value[f$es$p.value<PVAL])
#   if(length(signames>0)){
#     sigout <- ens2ext %>% dplyr::filter(ensembl_gene_id %in% signames | external_gene_name %in% signames)
#     sigout$size <- f$es$size[names(f$es$size) %in% signames]
#     sigout$pvalue <- f$es$p.value[names(f$es$p.value) %in% signames]
#     sigout$qvalue <- f$es$q.value[names(f$es$q.value) %in% signames]
#     outname <- gsub("RData", "sig.tsv", grep("vs",RDATA,value=T)[ff])
#     write_tsv(sigout, path=outname)
#   }
#   if(length(signames==0)){
#     print(paste0("No significant genes found at p < ", PVAL))
#   }
#   })
# }

#'  The situation arises where a single gene name maps to multiple gene identifiers
#'  e.g. ENSG00000271503, ENSG00000274233 both map to CCL5
#'  this function takes wide format input with at least 3 rows:
#'  1: gene name
#'  2: gene identifier
#'  3: expression value(s) [can be index of columns]
#'  function returns a wide format object with highest expression name+id
#'  also returns full mapping info and mean expression value per name, id pairs
#'
#' @param wide_object a wide tibble or df
#' @param name_col naming column (gene name)
#' @param id_col identification column (gene ID, i.e. ENSID)
#' @param value_col expression value column (can be index)
#' @importFrom magrittr '%>%'
#' @return wide format with higehst exp. when multiple genes same named
#' @export

highest_exp_wides <- function(wide_object, name_col, id_col, value_cols){


  ##create mean_value of all rows, then count name_col
  ##filter out all with name_col single mapping
  mean_map <- wide_object %>% dplyr::select(name_col, id_col, value_cols) %>%
                              dplyr::mutate(mean_value = rowMeans(.[value_cols])) %>%
                              dplyr::group_by(dplyr::across(name_col)) %>%
                              na.omit(mean_value) %>%
                              dplyr::add_count() %>%
                              dplyr::distinct()

  mm_1 <- mean_map %>% dplyr::filter(n == 1) %>%
                        dplyr::filter(mean_value == max(mean_value)) %>%
                        dplyr::ungroup() %>%
                        dplyr::distinct(dplyr::across(-id_col), .keep_all = TRUE) %>%
                        dplyr::select(-n, -mean_value)

  ##remove the higher ENSG00000 value, these are on patches in GRCh38
  str_no <- function(x){
    lapply(x, function(f){strsplit(f, "ENSG000")[[1]][2]})
  }
  mm_gt1 <- mean_map %>% dplyr::filter(n > 1) %>%
                         dplyr::filter(mean_value == max(mean_value)) %>%
                         dplyr::mutate(ensembl_gene_id_no = as.numeric(str_no(ensembl_gene_id))) %>%
                                               dplyr::filter(ensembl_gene_id_no == min(ensembl_gene_id_no)) %>%
                         dplyr::ungroup() %>%
                         dplyr::distinct(dplyr::across(-id_col), .keep_all = TRUE) %>%
                         dplyr::select(-n, -mean_value, -ensembl_gene_id_no)

  ##return the two sets, bound
  mean_map <- dplyr::bind_rows(mm_gt1, mm_1)
  return(mean_map)
}

#' Make required inputs for ARACNe-AP which is now supported through RNAseqon
#' @param tpm_tb tpms
#' @param metadata metadatas
#' @param meta_group metadatas group
#' @param tag string
#' @importFrom magrittr '%>%'
#' @return wide format with higehst exp. when multiple genes same named
#' @export

make_aracne_inputs <- function(tpm_tb, metadata, meta_group, tag){

  ##from *.de_ready.RData
  exprmat <- tpm_tb %>% dplyr::select(-external_gene_name)

  ##
  # regulators <- de_res %>% dplyr::select(ensembl_gene_id) %>%
  #                          as.data.frame()

  ##
  metaData <- metadata %>% dplyr::select(1, !!meta_group)

  readr::write_tsv(exprmat, file=paste0(tag, ".exprmat.tpm.tsv"))
  # readr::write_tsv(regulators, file=paste0(tag, ".regulators.txt"))
  readr::write_tsv(metaData, file=paste0(tag, ".metaData.txt"))
}

#' Run fgsea on each regulon set individually
#'
#' @param mra msviper class object with es, ledge
#' @param mra_stat statistic used to rank data, comes from msViper
#' @param genome_prefix string of a genome in biomart$dataset, suffixed with '_gene_ensembl'
#' @param msigdb_species one of msigdbr::msigdbr_species(), default:"Homo sapiens"
#' @param msigdb_cat one of 'c("H", paste0("C", c(1:7)))', see: gsea-msigdb.org/gsea/msigdb/collections.jsp
#' @param tag string used to prefix output
#' @param exprdh expression data
#' @param tx2gene annotation data
#' @param gene_col the name of the column in tibble with gene names found in pathways, set to "rownames" if they are the rownames (default)
#' @param output_dir path to where output goes
#' @param sig_val significance used in analysis
#' @param per_regulon flag to indicate if fgsea is at collective regulon level,
#'        or if each regulon and the genes it is co-expressed with are used
#' @param phenos metadata tibble
#' @importFrom magrittr '%>%'
#' @return sure
#' @export

fgsea_ssgsea_msviper <- function(mra, mra_stat = "p.value", genome_prefix, msigdb_species = "Homo sapiens", msigdb_cat = "H", tag, exprdh, tx2gene, gene_col = "external_gene_name", output_dir, sig_val = 0.1, per_regulon = NULL, phenos) {

  print("Running: fgsea_ssgsea_msviper()")

  ##output dir
  lapply(c("fgsea", "ssgsea"), function(ff){
    lapply(c("data", "plots"), function(f){
      dir.create(paste0(output_dir, "/", ff, "/", f),
                 recursive = TRUE,
                 showWarnings = FALSE)
    })
  })

  ##MsigDB pathways genelists
  msigdb_pathlist <- msigdb_pathways_to_list(msigdb_species, msigdb_cat)

  ##create ranks
  if(is.null(per_regulon)){

    rank_vec <- tibble::as_tibble(data.frame(stat = mra$es[["nes"]]), rownames = "ensembl_gene_id") %>%
                dplyr::left_join(., tx2gene[,c(2,3)]) %>%
                dplyr::select(external_gene_name, stat) %>%
                dplyr::distinct() %>%
                na.omit() %>%
                tibble::deframe()

    ##run, order fgsea
    fgsea_res <- fgsea::fgsea(pathways = msigdb_pathlist,
                             stats = rank_vec,
                             nperm = 1000000)

    ##make tibble and add FDR, filters
    fgsea_res_tb <- dplyr::mutate(.data = tibble::as_tibble(fgsea_res))
    fgsea_res_sig_tb <- dplyr::filter(.data = fgsea_res_tb, padj < !!sig_val)
    fgsea_res_sig_tb <- dplyr::arrange(.data = fgsea_res_sig_tb, desc(NES))

    gg_fgsea <- fgsea_plot(fgsea_res_tb = fgsea_res_sig_tb, msigdb_cat = msigdb_cat)
    ggplot2::ggsave(gg_fgsea, file = paste0(output_dir, "/fgsea/plots/", tag , ".fgsea_sig_viper.ggplot2.pdf"))
    save(fgsea_res_tb, file = paste0(output_dir, "/fgsea/data/", tag , ".fgsea_sig_viper.RData"))

  } else {

    ##get those sweet rotation ssGSEA  plots
    sigulons <- names(mra$regulon)[mra$es$p.value < sig_val]
    sigulons <- sigulons[sigulons %in% tx2gene$ensembl_gene_id]
    sigunames <- dplyr::filter(.data = dplyr::distinct(tx2gene[,c(2,3)]), ensembl_gene_id %in% sigulons)
    sigunames <- c(dplyr::select(.data = sigunames, external_gene_name))

    fgsea_res_sigulon_list <- lapply(sigulons, function(f){
      print(f)
      tfmode <- mra$regulon[[f]]$tfmode
      rank_vec <- tibble::as_tibble(tfmode, rownames = "ensembl_gene_id") %>%
                  dplyr::left_join(., dplyr::distinct(tx2gene[,c(2,3)])) %>%
                  dplyr::select(external_gene_name, stat = "value") %>%
                  dplyr::distinct() %>%
                  na.omit() %>%
                  tibble::deframe()

      ##run, order fgsea
      fgsea_res <- fgsea::fgsea(pathways = msigdb_pathlist,
                               stats = rank_vec,
                               nperm = 1000000)

      ##make tibble and add FDR, filters
      fgsea_res_tb <- dplyr::mutate(.data = tibble::as_tibble(fgsea_res))
      fgsea_res_sig_tb <- dplyr::filter(.data = fgsea_res_tb, padj < !!sig_val)

      if(dim(fgsea_res_sig_tb)[1] > 0){

        gg_fgsea <- fgsea_plot(fgsea_res_tb = fgsea_res_sig_tb, msigdb_cat = msigdb_cat)
        ggplot2::ggsave(gg_fgsea, file = paste0(output_dir, "/fgsea/plots/", tag , ".fgsea_sig_viper.ggplot2.pdf"))
        save(fgsea_res_tb, gg_fgsea, file = paste0(output_dir, "/fgsea/data/", tag , ".fgsea_sig_viper.RData"))
      }

      return(list(rank_names = names(rank_vec), fgsea_res = fgsea_res))

    })

    names(fgsea_res_sigulon_list) <- unlist(sigunames)
    save(fgsea_res_sigulon_list, file = paste0(output_dir, "/fgsea/data/", tag , ".per_regulon_viper.RData"))

    ##do all sigulons at once, i.e. all genes in sigulon genesets
    sigulon_s <- unlist(mra$regulon[mra$es$p.value < sig_val])
    sigulon_tfmode <- sigulon_s[grep("tfmode", names(sigulon_s))]
    sigulon_set <- sigulon_tfmode
    names(sigulon_set) <- unlist(lapply(names(sigulon_set), function(f){
        strsplit(f, "\\.")[[1]][3]
      }))

    rank_vec_sigulon_tb <- tibble::enframe(sigulon_set) %>%
            dplyr::rename(ensembl_gene_id = "name") %>%
            dplyr::left_join(., dplyr::distinct(tx2gene[,c(2,3)])) %>%
            dplyr::select(external_gene_name, ensembl_gene_id, value) %>%
            dplyr::distinct() %>%
            na.omit() %>%
            dplyr::arrange(external_gene_name) %>% dplyr::group_by(external_gene_name, ensembl_gene_id) %>% dplyr::summarize(stat = mean(value, na.rm = TRUE)) %>%
            dplyr::ungroup()

    rank_vec_sigulon_set <- tibble::deframe(rank_vec_sigulon_tb[,c("external_gene_name", "stat")])

    ##run, order fgsea
    fgsea_res_sigulon_set <- fgsea::fgsea(pathways = msigdb_pathlist,
                                         stats = rank_vec_sigulon_set,
                                         nperm = 1000000)
    fgsea_sig_sigulon_set <- fgsea_res_sigulon_set %>%
                             dplyr::filter(pval < sig_val)

    ##output results per gene
    pathways_sig_res_tb <- msigdb_pathlist[names(msigdb_pathlist) %in% fgsea_sig_sigulon_set$pathway] %>%
                           tibble::enframe("pathway", dplyr::all_of("external_gene_name")) %>%
                           tidyr::unnest(cols = "external_gene_name") %>%
                           dplyr::inner_join(rank_vec_sigulon_tb, by = "external_gene_name") %>%
                           dplyr::select(-stat) %>%
                           dplyr::distinct()

    ##which regulons control which genes in those pathways?
    reg_strsp <- function(x, y){
      unlist(lapply(x, function(f){
      strsplit(unlist(f), "\\.")[[1]][[y]]
    }))}
    regulon_ext_pway_tb <- tibble::enframe(sigulon_tfmode) %>%
                            dplyr::mutate(regulon = reg_strsp(name, 1),
                                          ensembl_gene_id = reg_strsp(name, 3)) %>%
                            dplyr::select(regulon, ensembl_gene_id) %>%
                            dplyr::left_join(pathways_sig_res_tb, .) %>%
                            dplyr::select(regulon, pathway, external_gene_name, ensembl_gene_id) %>%
                            dplyr::arrange(regulon)

    ##do ssgsea
    add_small <- function(f){f+0.0000001}
    log2_tpm_mat <- dplyr::mutate(.data = exprdh, dplyr::across(where(is.numeric), add_small)) %>%
                    dplyr::mutate(dplyr::across(where(is.numeric), log2)) %>%
                    dplyr::select(-ensembl_gene_id) %>%
                    dplyr::distinct() %>%
                    tibble::column_to_rownames("external_gene_name") %>%
                    as.matrix()

    metadata <- phenos %>% dplyr::rename(sample = "sampleID", group = "description")

    pways <- lapply(fgsea_res_sigulon_list, function(f){
      return(f$rank_names)
    })
    names(pways) <- sigunames$external_gene_name

    if(length(fgsea_res_sigulon_list) > 1){
      ssgsea_pca_list <- RNAseqon::ssgsea_pca(pways = pways,
                                    log2tpm_mat = log2_tpm_mat,
                                    msigdb_cat = msigdb_cat,
                                    output_dir = output_dir,
                                    contrast = paste0(tag, " msViper Regulon Genesets"),
                                    metadata = metadata)
      names(ssgsea_pca_list) <- sigunames
      save(ssgsea_pca_list, pways, log2_tpm_mat, metadata, file = paste0(output_dir, "/ssgsea/data/", tag , ".ssgsea_pca_viper.RData"))
    }
  }
}
brucemoran/RNAseqR documentation built on Sept. 14, 2021, 11:08 a.m.