R/importexport.R
In tmod: Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

Documented in makeTmodFromDataFrame tmod2DataFrame tmodImportMSigDB

## imports the XML format of MSigDB
.importMsigDBXML <- function(file, fields, organism) {
  msig <- list()

  fields <- unique(c("SYSTEMATIC_NAME", "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", fields))
  field.names <- c( "ID", "Title", "Category", "Subcategory", fields[-c(1:4)] )

  foo <- xmlParse(file)
  foo <- xmlToList(foo)
  
  if(organism != "all") {
    orgs <- sapply(foo, function(x) x["ORGANISM"])
    foo <- foo[ orgs == organism ]
  }

  # remove NULLs
  foo <- foo[ ! sapply(foo, is.null) ]

  msig$gs <- t(sapply(foo, function(x) x[ fields ]))

  colnames(msig$gs) <- field.names
  msig$gs <- as_tibble(msig$gs)

  if(any(duplicated(msig$gs$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$gs$oldID <- msig$gs$ID
    msig$gs$ID    <- make.unique(as.character(msig$gs$ID))
  }

  #rownames(msig$gs) <- msig$MODULES[,"ID"]

  msig$gs2gv <- sapply(foo, function(x) x[ "MEMBERS_SYMBOLIZED" ])
  msig$gs2gv <- strsplit(msig$gs2gv, ",")

  msig$gv <- unique(unlist(msig$gs2gv))
  msig$gs2gv <- lapply(msig$gs2gv, function(x) match(x, msig$gv))

  #names(msig$gs2gv) <- msig$gs$ID

  as_tmodGS(msig)
}


## imports the GMT format of MSigDB
.importMsigDBGMT <- function(file) {
  stop("This does not work at the present.")
  msig <- list()

  con <- file(file, open="r")
  lines <- readLines(con)
  close(con)

  ids   <- gsub( "\t.*", "", lines)
  desc  <- gsub( "^[^\t]*\t([^\t]*)\t.*", "\\1", lines )
  genes <- gsub( "^[^\t]*\t[^\t]*\t(.*)", "\\1", lines )

  msig$MODULES <- data.frame(
    ID=ids, Title=desc, stringsAsFactors=FALSE)
  if(any(duplicated(msig$MODULES$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$MODULES$oldID <- msig$MODULES$ID
    msig$MODULES$ID    <- make.unique(as.character(msig$MODULES$ID))
  }

  rownames(msig$MODULES) <- msig$MODULES[,"ID"]

  msig$MODULES2GENES <- strsplit(genes, "\t")
  names(msig$MODULES2GENES) <- ids

  msig$GENES <- data.frame( ID=unique(unlist(msig$MODULES2GENES)))
  msig <- new("tmod", msig)
  msig
}



#' Import data from MSigDB
#'
#' Import data from an MSigDB file in either XML or GMT format
#'
#' This command parses a file from MSigDB. Both XML and the MSigDB-specific
#' "GMT" format are supported (however, the latter is discouraged, as it
#' contains less information).
#' @param file The name of the file to parse
#' @param format Format (either "xml" or "gmt")
#' @param organism Select the organism to use. Use "all" for all organisms in the file (only for "xml" format; default: "Homo sapiens")
#' @param fields Which fields to import to the MODULES data frame (only for "xml" format)
#' @return A tmod object
#' @importFrom XML xmlParse xmlToList
#' @importFrom tibble as_tibble
#' @examples
#' \dontrun{
#' ## First, download the file "msigdb_v7.5.1.xml" 
#' ## from http://www.broadinstitute.org/gsea/downloads.jsp
#' msig <- tmodImportMSigDB("msigdb_v7.5.1.xml")
#' }
#' @export

tmodImportMSigDB <- function( file=NULL, format="xml", organism="Homo sapiens",
  fields=c( "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", "EXACT_SOURCE", "EXTERNAL_DETAILS_URL") ) {

  if(length(file) != 1) stop("Incorrect file parameter")
  if(!file.exists(file)) stop( sprintf("File %s does not exist", file))

  format <- match.arg(format, c( "xml", "gmt"))
  msig <- switch(format,
    xml=.importMsigDBXML(file, fields, organism),
    gmt=.importMsigDBGMT(file))

  s <- msig$gs$Title
  msig$gs$Title <- paste0(toupper(substring(s, 1,1)), tolower(substring(s, 2)) )
  msig$gs$Title <- gsub( "^Gse([0-9])", "GSE\\1", msig$gs$Title )
  msig$gs$Title <- gsub( "_", " ", msig$gs$Title )

  msig$gs$B <- sapply(msig$gs2gv, length)
  msig
}


.tmod2DataFrameRowsModule <- function(mset, module_col, feature_col, sep) {
  ret <- mset$gs
  ret[ , feature_col ] <- sapply(mset$gs2gv, function(x) paste(mset$gv[ x ], collapse=sep))
  colnames(ret)[ colnames(ret) == "ID" ] <- module_col
  ret
}

.tmod2DataFrameRowsFeatures <- function(mset, module_col, feature_col, sep) {
  ret <- tibble(ID=mset$gv)
  colnames(ret) <- feature_col

  gv2gs <- split(rep(mset$gs$ID, lengths(mset$gs2gv)), mset$gv[ unlist(mset$gs2gv) ])
  ret[ , module_col ] <- sapply(gv2gs, function(x) paste(x, collapse=sep))
  ret
}


#' Convert a tmod module set into a data frame
#'
#' Convert a tmod module set into a data frame
#'
#' @param mset a tmod object (e.g. generated by makeTmod)
#' @param rows if "modules", then there will be a row corresponding to each
#' module (gene set); if "features", then there will be a row corresponding to
#' each gene.
#' @param module_col Name of the column with module (gene set) IDs
#' @param feature_col Name of the column with feature (gene) IDs
#' @param sep separator used to collate module IDs (if rows=="features") or feature IDs (if rows=="modules")
#' @seealso \code{\link{makeTmodGS}}, \code{\link{makeTmod}}
#' @export
tmod2DataFrame <- function(mset, rows="modules", module_col="module_id", feature_col="feature_id", sep=",") {
  mset <- .getmodules_gs(NULL, mset)

  rows <- match.arg(rows, c("modules", "features"))

  ret <- switch(rows, 
    modules  = .tmod2DataFrameRowsModule(mset, module_col, feature_col, sep),
    features = .tmod2DataFrameRowsFeatures(mset, module_col, feature_col, sep)
    )

  return(ret)
}



#' Convert a data frame to a tmod object
#'
#' Convert a data frame to a tmod object
#'
#' `makeTmodFromFeatureDataFrame` converts mapping information from features (genes) to modules (gene
#' sets). The data frame has a row for each feature-module pair.
#' 
#' `makeTmodFromModuleDataFrame` converts mapping information from features
#' (genes) to modules (gene sets). The data frame has a row for each module,
#' and all gene IDs corresponding to a module are stored as a comma separated string, e.g.
#'        
#' Vice versa, `tmod2DataFrame` converts a tmod object to a data frame.        


#' @param df A data frame
#' @param feature_col Which column contains the feature (gene) IDs
#' @param module_col Which column contains the module (gene set) IDs
#' @param title_col Description of the modules (if NULL, the description will
#'        be taken from the module_col)
#' @param extra_module_cols Additional columns to include in the module data frame
#' @param extra_gene_cols Additional gene columns to include in the genes data frame
#' @seealso \code{\link{makeTmodGS}}, \code{\link{makeTmodGS}}
#' @return A tmod object
#' @examples
#' df <- data.frame(
#' gene_id=LETTERS[1:10],
#' geneset_id=rep(letters[1:2], each=5),
#' geneset_description=rep(paste0("Gene set ", letters[1:2]), each=5))
#' res <- makeTmodFromDataFrame(df, 
#'   feature_col="gene_id", 
#'   module_col="geneset_id",
#'   title_col="geneset_description")
#' @export
makeTmodFromDataFrame <- function(df, feature_col=1, module_col=2, title_col=NULL, extra_module_cols=NULL, extra_gene_cols=NULL) {
  if(!is.data.frame(df)) stop("df must be a data.frame")
  df <- df[ !is.na(df[, feature_col]) & !is.na(df[, module_col]), ] 
  df <- as.data.frame(df) ## in case it is a tibble

  df_unique <- df[ !duplicated(df[, module_col ]), ]

  mods <- tibble(ID=df_unique[ , module_col ])

  if(is.null(title_col)) {
    title_col <- module_col
  }

  mods[ , "Title"] <- df_unique[ , title_col ]
  if(!is.null(extra_module_cols)) {
    extr <- df_unique[ , extra_module_cols, drop=FALSE ]
    mods <- cbind(mods, extr)
  }

  #m2g <- lapply(mods[ , "ID" ], function(m) df[ df[ , module_col ] == m, feature_col])
  gv <- unique(df[ , feature_col ])
  df[ , feature_col ] <- match(df[ , feature_col ], gv)
  gs2gv <- tapply(df[ , feature_col ], df[ , module_col ], unique, simplify=FALSE)

  ## convert from array list to list
  gs2gv <- lapply(gs2gv, function(.) .)
  mods <- mods[ match(names(gs2gv), mods$ID), ]


  message("making Tmod")
  as_tmodGS(list(gs=mods, gs2gv=gs2gv, gv=gv))
}

Any scripts or data that you put into this service are public.

tmod documentation built on March 31, 2023, 9 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tmod
Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

R/importexport.R
In tmod: Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

Defines functions makeTmodFromDataFrame tmod2DataFrame .tmod2DataFrameRowsFeatures .tmod2DataFrameRowsModule tmodImportMSigDB .importMsigDBGMT .importMsigDBXML

Documented in makeTmodFromDataFrame tmod2DataFrame tmodImportMSigDB

Try the tmod package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tmod Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

R/importexport.R In tmod: Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

Defines functions makeTmodFromDataFrame tmod2DataFrame .tmod2DataFrameRowsFeatures .tmod2DataFrameRowsModule tmodImportMSigDB .importMsigDBGMT .importMsigDBXML

Documented in makeTmodFromDataFrame tmod2DataFrame tmodImportMSigDB

Try the tmod package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tmod
Feature Set Enrichment Analysis for Metabolomics and Transcriptomics

R/importexport.R
In tmod: Feature Set Enrichment Analysis for Metabolomics and Transcriptomics