R/importexport.R

Defines functions makeTmodFromDataFrame tmod2DataFrame .tmod2DataFrameRowsFeatures .tmod2DataFrameRowsModule tmodImportMSigDB .importMsigDBGMT .importMsigDBXML

Documented in makeTmodFromDataFrame tmod2DataFrame tmodImportMSigDB

## Imports the XML format of MSigDB.
##
## file:     path to the MSigDB XML file
## fields:   additional gene set attributes to copy into the MODULES data
##           frame; the four attributes used for ID, Title, Category and
##           Subcategory are always included
## organism: value of the ORGANISM attribute to keep, or "all" to keep every
##           gene set
##
## Returns a "tmod" object built from a list with the elements MODULES,
## MODULES2GENES and GENES. Stops if no gene set is left after filtering.
.importMsigDBXML <- function(file, fields, organism) {
  msig <- list()

  fields <- unique(c("SYSTEMATIC_NAME", "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", fields))
  field.names <- c( "ID", "Title", "Category", "Subcategory", fields[-c(1:4)] )

  foo <- xmlParse(file)
  foo <- xmlToList(foo)

  ## Keep only gene set entries. xmlToList() appends the attributes of the
  ## root node as an extra ".attrs" element; it is not a gene set and would
  ## otherwise end up as an all-NA row when organism is "all".
  keep <- !vapply(foo, is.null, logical(1))
  if(!is.null(names(foo))) {
    keep <- keep & names(foo) != ".attrs"
  }
  foo <- foo[ keep ]

  if(organism != "all") {
    orgs <- vapply(foo, function(x) as.character(x["ORGANISM"])[1], character(1))
    ## %in% never yields NA, so entries without an ORGANISM attribute are
    ## dropped instead of turning into NULL list elements
    foo <- foo[ orgs %in% organism ]
  }

  if(length(foo) == 0) {
    stop(sprintf("No gene sets found for organism \"%s\"", organism))
  }

  ## one row per gene set, one column per requested attribute
  msig$MODULES <- t(vapply(foo,
    function(x) unname(x[ fields ]), character(length(fields)), USE.NAMES=FALSE))
  colnames(msig$MODULES) <- field.names
  msig$MODULES <- data.frame(msig$MODULES, stringsAsFactors=FALSE, row.names=NULL)

  if(any(duplicated(msig$MODULES$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$MODULES$oldID <- msig$MODULES$ID
    msig$MODULES$ID    <- make.unique(as.character(msig$MODULES$ID))
  }

  rownames(msig$MODULES) <- msig$MODULES[,"ID"]

  ## MEMBERS_SYMBOLIZED is a comma separated list of gene symbols
  msig$MODULES2GENES <- lapply(foo, function(x) strsplit( x["MEMBERS_SYMBOLIZED"], "," )[[1]])

  ## use the (possibly de-duplicated) module IDs as names
  names(msig$MODULES2GENES) <- msig$MODULES$ID
  msig$GENES <- data.frame( ID=unique(unlist(msig$MODULES2GENES)), stringsAsFactors=FALSE)

  msig <- new("tmod", msig)
  msig
}


## Imports the GMT format of MSigDB.
##
## Each non-blank line is read as one gene set with tab separated fields:
## the gene set ID, a description, and then the member genes.
##
## Returns a "tmod" object built from a list with the elements MODULES,
## MODULES2GENES and GENES.
.importMsigDBGMT <- function(file) {
  msig <- list()

  ## readLines() opens and closes the connection itself, so nothing stays
  ## open if reading fails
  lines <- readLines(file)

  ## blank lines do not describe a gene set
  lines <- lines[ nzchar(trimws(lines)) ]

  ## split once; lines with fewer than three fields simply get an empty
  ## description and/or an empty gene list
  parts <- strsplit(lines, "\t")
  ids   <- vapply(parts, function(x) x[1], character(1))
  desc  <- vapply(parts, function(x) if(length(x) > 1) x[2] else "", character(1))
  genes <- lapply(parts, function(x) x[-c(1, 2)])

  msig$MODULES <- data.frame(
    ID=ids, Title=desc, stringsAsFactors=FALSE)
  if(any(duplicated(msig$MODULES$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$MODULES$oldID <- msig$MODULES$ID
    msig$MODULES$ID    <- make.unique(as.character(msig$MODULES$ID))
  }

  rownames(msig$MODULES) <- msig$MODULES[,"ID"]

  ## name the gene lists with the (possibly de-duplicated) module IDs so
  ## that they always agree with MODULES$ID
  msig$MODULES2GENES <- genes
  names(msig$MODULES2GENES) <- msig$MODULES$ID

  gene_ids <- unique(unlist(msig$MODULES2GENES, use.names=FALSE))
  if(is.null(gene_ids)) gene_ids <- character(0)
  msig$GENES <- data.frame( ID=gene_ids, stringsAsFactors=FALSE)

  msig <- new("tmod", msig)
  msig
}



#' Import data from MSigDB
#'
#' Read an MSigDB file (XML or GMT format) and return it as a tmod object
#'
#' The file is parsed with a format-specific importer. Afterwards the module
#' titles are tidied up (first letter upper case, remainder lower case, a
#' leading "Gse" followed by a digit restored to "GSE", underscores replaced
#' by spaces) and the number of genes per module is stored in column `B` of
#' the MODULES data frame. Both XML and the MSigDB-specific "GMT" format are
#' supported (however, the latter is discouraged, as it contains less
#' information).
#' @param file The name of the file to parse (a single file name)
#' @param format Format (either "xml" or "gmt")
#' @param organism Select the organism to use. Use "all" for all organisms in the file (only for "xml" format; default: "Homo sapiens")
#' @param fields Which fields to import to the MODULES data frame (only for "xml" format)
#' @return A tmod object
#' @importFrom XML xmlParse xmlToList
#' @examples
#' \dontrun{
#' ## First, download the file "msigdb_v5.0.xml" from http://www.broadinstitute.org/gsea/downloads.jsp
#' msig <- tmodImportMSigDB( "msigdb_v5.0.xml" )
#' }
#' @export
tmodImportMSigDB <- function( file=NULL, format="xml", organism="Homo sapiens",
  fields=c( "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", "EXTERNAL_DETAILS_URL") ) {

  ## exactly one existing file is required
  if(length(file) != 1) stop("Incorrect file parameter")
  if(!file.exists(file)) stop( sprintf("File %s does not exist", file))

  ## dispatch to the importer for the requested format
  format <- match.arg(format, c( "xml", "gmt"))
  if(format == "xml") {
    msig <- .importMsigDBXML(file, fields, organism)
  } else {
    msig <- .importMsigDBGMT(file)
  }

  ## tidy up the titles: sentence case, "GSE" prefix, spaces for underscores
  titles  <- msig$MODULES$Title
  initial <- toupper(substring(titles, 1,1))
  rest    <- tolower(substring(titles, 2))
  titles  <- paste0(initial, rest)
  titles  <- gsub( "^Gse([0-9])", "GSE\\1", titles )
  titles  <- gsub( "_", " ", titles )
  msig$MODULES$Title <- titles

  ## module sizes
  msig$MODULES$B <- sapply(msig$MODULES2GENES, length)
  msig
}


## Builds a data frame with one row per module: the MODULES data frame of
## `mset` plus a column named `feature_col` which holds, for each module, its
## feature IDs pasted together with `sep`. The "ID" column is renamed to
## `module_col`.
.tmod2DataFrameRowsModule <- function(mset, module_col, feature_col, sep) {
  ret <- mset$MODULES
  m2g <- mset$MODULES2GENES

  ## Align the feature lists with the rows of MODULES by module ID rather
  ## than by position, so a differently ordered MODULES2GENES cannot attach
  ## features to the wrong module. Falls back to positional order when the
  ## IDs cannot be matched by name.
  ids <- ret[["ID"]]
  if(!is.null(ids) && !is.null(names(m2g)) && all(as.character(ids) %in% names(m2g))) {
    m2g <- m2g[ as.character(ids) ]
  }

  ret[ , feature_col ] <- vapply(m2g, function(x) paste(x, collapse=sep), character(1))
  colnames(ret)[ colnames(ret) == "ID" ] <- module_col
  ret
}

## Builds a data frame with one row per feature: the GENES data frame of
## `mset` plus a column named `module_col` which holds, for each feature, the
## IDs of its modules pasted together with `sep`. The "ID" column is renamed
## to `feature_col`. If `mset` has no GENES2MODULES element, the mapping is
## obtained from MODULES2GENES with .invert_hash().
.tmod2DataFrameRowsFeatures <- function(mset, module_col, feature_col, sep) {
  ret <- mset$GENES

  g2m <- mset$GENES2MODULES
  if(is.null(g2m)) {
    g2m <- .invert_hash(mset$MODULES2GENES)
  }

  ## Align the module lists with the rows of GENES by feature ID rather than
  ## by position, so a differently ordered mapping cannot attach modules to
  ## the wrong feature. Falls back to positional order when the IDs cannot
  ## be matched by name.
  ids <- ret[["ID"]]
  if(!is.null(ids) && !is.null(names(g2m)) && all(as.character(ids) %in% names(g2m))) {
    g2m <- g2m[ as.character(ids) ]
  }

  colnames(ret)[ colnames(ret) == "ID" ] <- feature_col
  ret[ , module_col ] <- vapply(g2m, function(x) paste(x, collapse=sep), character(1))
  ret
}


#' Convert a tmod module set into a data frame
#'
#' Convert a tmod module set into a data frame
#'
#' Depending on `rows`, the result has either one row per module, with the
#' feature IDs of the module collated into a single string, or one row per
#' feature, with the module IDs of the feature collated into a single string.
#'
#' @param mset a tmod object (e.g. generated by makeTmod)
#' @param rows if "modules", then there will be a row corresponding to each
#' module (gene set); if "features", then there will be a row corresponding to
#' each gene.
#' @param module_col Name of the column with module (gene set) IDs
#' @param feature_col Name of the column with feature (gene) IDs
#' @param sep separator used to collate module IDs (if rows=="features") or feature IDs (if rows=="modules")
#' @return A data frame
#' @seealso \code{\link{tmod-class}}, \code{\link{makeTmod}}
#' @export
tmod2DataFrame <- function(mset, rows="modules", module_col="module_id", feature_col="feature_id", sep=",") {
  mset <- .getmodules2(NULL, mset)
  rows <- match.arg(rows, c("modules", "features"))

  ## after match.arg(), rows is exactly one of the two choices
  converter <- if(rows == "modules") {
    .tmod2DataFrameRowsModule
  } else {
    .tmod2DataFrameRowsFeatures
  }

  converter(mset, module_col, feature_col, sep)
}



#' Convert a data frame to a tmod object
#'
#' Convert a data frame to a tmod object
#'
#' `makeTmodFromFeatureDataFrame` converts mapping information from features (genes) to modules (gene
#' sets). The data frame has a row for each feature-module pair.
#' 
#' `makeTmodFromModuleDataFrame` converts mapping information from features
#' (genes) to modules (gene sets). The data frame has a row for each module,
#' and all gene IDs corresponding to a module are stored as a single
#' comma-separated string.
#'
#' Vice versa, `tmod2DataFrame` converts a tmod object to a data frame.


#' @param df A data frame
#' @param feature_col Which column contains the feature (gene) IDs
#' @param module_col Which column contains the module (gene set) IDs
#' @param title_col Description of the modules (if NULL, the description will
#'        be taken from the module_col)
#' @param extra_module_cols Additional columns to include in the module data frame
#' @param extra_gene_cols Additional gene columns to include in the genes data frame
#' @seealso \code{\link{tmod-class}}, \code{\link{makeTmod}}
#' @return A tmod object
#' @examples
#' df <- data.frame(
#' gene_id=LETTERS[1:10],
#' geneset_id=rep(letters[1:2], each=5),
#' geneset_description=rep(paste0("Gene set ", letters[1:2]), each=5))
#' res <- makeTmodFromDataFrame(df, 
#'   feature_col="gene_id", 
#'   module_col="geneset_id",
#'   title_col="geneset_description")
#' @export
makeTmodFromDataFrame <- function(df, feature_col=1, module_col=2, title_col=NULL, extra_module_cols=NULL, extra_gene_cols=NULL) {
  if(!is.data.frame(df)) stop("df must be a data.frame")

  ## Convert before any subsetting (if it were a tibble...), so that the
  ## column extractions below operate on a plain data frame
  df <- as.data.frame(df)

  ## rows without a feature ID or without a module ID carry no mapping
  df <- df[ !is.na(df[[feature_col]]) & !is.na(df[[module_col]]), , drop=FALSE ]

  ## IDs are used as names and for matching; store factors as plain strings
  for(id_col in list(feature_col, module_col)) {
    if(is.factor(df[[id_col]])) df[[id_col]] <- as.character(df[[id_col]])
  }

  ## one row per module: the first row in which each module ID appears
  df_unique <- df[ !duplicated(df[[module_col]]), , drop=FALSE ]
  mods <- data.frame(ID=df_unique[[module_col]], stringsAsFactors=FALSE)

  ## Unique features per module. The explicit factor levels keep the modules
  ## in order of first appearance, i.e. in the same order as the rows of
  ## `mods` (grouping without them would sort the module IDs).
  grouping <- factor(df[[module_col]], levels=unique(df[[module_col]]))
  m2g <- split(df[[feature_col]], grouping)
  m2g <- lapply(m2g, unique)

  if(is.null(title_col)) {
    title_col <- module_col
  }
  mods[ , "Title"] <- df_unique[[title_col]]
  if(!is.null(extra_module_cols)) {
    ## drop=FALSE keeps the column name when only one column is requested
    mods <- cbind(mods, df_unique[ , extra_module_cols, drop=FALSE ])
  }

  message("unlisting m2g")
  g_ids <- unique(unlist(m2g, use.names=FALSE))
  if(is.null(g_ids)) g_ids <- character(0)
  gens <- data.frame(ID=g_ids, stringsAsFactors=FALSE)

  if(!is.null(extra_gene_cols)) {
    ## annotation is taken from the first row in which each feature appears
    df_matched <- df[ match(g_ids, df[[feature_col]]), , drop=FALSE ]
    gens <- cbind(gens, df_matched[ , extra_gene_cols, drop=FALSE ])
  }

  message("making Tmod")
  makeTmod(modules=mods, modules2genes=m2g, genes=gens)
}

Try the tmod package in your browser

Any scripts or data that you put into this service are public.

tmod documentation built on Oct. 23, 2020, 6:12 p.m.