dnapath2: Differential Network Analysis Using Gene Pathways

Documented in combine_overlapping_pathways get_reactome_pathways

#' Obtain reactome pathways
#'
#' @param species A string, for example "Homo sapiens" or "Mus musculus", 
#' indicating the species to use. 
#' @param overlap_limit (Optional) Any pathways that have an overlap
#' greater than overlap_limit are combined. Set to NULL to disable
#' this option.
#' @return A named list of vectors. Each vector corresponds to a Reactome pathway
#' and contains the entrezgene IDs of the genes in that pathway.
#' @export
get_reactome_pathways <- function(species, overlap_limit = 0.9) {
  
  cat("Obtaining reactome pathway information for species:", species, "\n")
  # Get list of reactome pathway names
  # Map names to ID
  # Map ID to entrez ID
  # Save results as p by g matrix. (Optional; not performed.)
  
  # library(reactome.db)
  # Map reactome NAME to reactome ID. Subset on pathways for this species.
  reactome_to_id <- as.list(reactome.db::reactomePATHNAME2ID)
  available_species <- unique(
    gsub("(: .*)", "", names(as.list(reactome.db::reactomePATHNAME2ID))))
  index <- which(grepl(species,
                       names(as.list(reactome.db::reactomePATHNAME2ID))))
  if(length(index) == 0) {
    stop(species, "is not an available species. Use one of the following:\n\t", 
         paste(available_species, collapse = ", "), ".")
  }
  reactome_to_id <- reactome_to_id[index]
  
  # Map reactome ID to entrez ID.
  id_to_entrez <- as.list(reactome.db::reactomePATHID2EXTID)
  
  # Subset on reactome IDs obtained in first step.
  index <- which(names(id_to_entrez) %in% sapply(reactome_to_id, dplyr::first))
  id_to_entrez <- id_to_entrez[index]
  
  # Remove any pathways containing fewer than 5 genes.
  sizes <- sapply(id_to_entrez, function(x) length(unique(x)))
  id_to_entrez <- id_to_entrez[sizes >= 5]
  
  # Map reactome ID back to reactome NAME.
  id_to_reactome <- as.list(reactome.db::reactomePATHID2NAME)
  
  # Subset on reactome IDs obtained in previous step.
  index <- which(names(id_to_reactome) %in% names(id_to_entrez))
  id_to_reactome <- id_to_reactome[index]
  
  # Finally, map reactome NAME to entrez ID
  pathway_list <- id_to_entrez
  names(pathway_list) <- sapply(names(id_to_entrez), function(x) {
    dplyr::first(id_to_reactome[[which(names(id_to_reactome) == x)[1]]])
  })
  names(pathway_list) <- gsub("^[ A-Za-z]*: ", "", names(pathway_list))
  
  if(!is.null(overlap_limit)) {
    cat("Combining pathways that have greater than", overlap_limit, "overlap.\n")
    pathway_list <- combine_overlapping_pathways(pathway_list, overlap_limit)
  }
  return(pathway_list)
}

#' Modify a pathway list to combine overlapping pathways.
#'
#' @param pathway_list A list of pathways obtained from
#' \code{\link{get_reactome_pathways}}.
#' @param threshold A percentage between 0 and 1. If two pathways
#' overlap by more than this amount, they are combined into one pathway.
#' @return A modified list with overlapping pathways combined together.
#' @export
combine_overlapping_pathways <- function(pathway_list, overlap_limit = 0.9) {
  # Determine which pathways are similar.
  n <- length(pathway_list)
  similar_to <- diag(FALSE, n)
  for(i in 2:n) {
    for(j in 1:(i - 1)) {
      similar_to[i, j] <- 
        (length(intersect(pathway_list[[i]], pathway_list[[j]])) /
           length(union(pathway_list[[i]], pathway_list[[j]]))) > overlap_limit
    }
  }
  backup <- similar_to
  similar_to <- backup
  pathways_to_remove <- NULL
  for(i in 1:ncol(similar_to)) {
    if(sum(similar_to[, i]) > 1) {
      index <- which(similar_to[, i])
      names(pathway_list)[i] <- paste0(names(pathway_list)[i], " (See also: ",
                                       paste0(names(pathway_list)[index],
                                              collapse = "; "), ")")
      
      pathways_to_remove <- c(pathways_to_remove, index)
    }
  }
  pathways_to_remove <- unique(pathways_to_remove)
  return(pathway_list[-pathways_to_remove])
}