R/importexport.R

Defines functions makeTmodFromDataFrame tmod2DataFrame .tmod2DataFrameRowsFeatures .tmod2DataFrameRowsModule tmodImportMSigDB .importMsigDBGMT .importMsigDBXML

Documented in makeTmodFromDataFrame tmod2DataFrame tmodImportMSigDB

## imports the XML format of MSigDB
.importMsigDBXML <- function(file, fields, organism) {
  msig <- list()

  fields <- unique(c("SYSTEMATIC_NAME", "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", fields))
  field.names <- c( "ID", "Title", "Category", "Subcategory", fields[-c(1:4)] )

  foo <- xmlParse(file)
  foo <- xmlToList(foo)
  
  if(organism != "all") {
    orgs <- sapply(foo, function(x) x["ORGANISM"])
    foo <- foo[ orgs == organism ]
  }

  # remove NULLs
  foo <- foo[ ! sapply(foo, is.null) ]

  msig$gs <- t(sapply(foo, function(x) x[ fields ]))

  colnames(msig$gs) <- field.names
  msig$gs <- as_tibble(msig$gs)

  if(any(duplicated(msig$gs$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$gs$oldID <- msig$gs$ID
    msig$gs$ID    <- make.unique(as.character(msig$gs$ID))
  }

  #rownames(msig$gs) <- msig$MODULES[,"ID"]

  msig$gs2gv <- sapply(foo, function(x) x[ "MEMBERS_SYMBOLIZED" ])
  msig$gs2gv <- strsplit(msig$gs2gv, ",")

  msig$gv <- unique(unlist(msig$gs2gv))
  msig$gs2gv <- lapply(msig$gs2gv, function(x) match(x, msig$gv))

  #names(msig$gs2gv) <- msig$gs$ID

  as_tmodGS(msig)
}


## imports the GMT format of MSigDB
.importMsigDBGMT <- function(file) {
  stop("This does not work at the present.")
  msig <- list()

  con <- file(file, open="r")
  lines <- readLines(con)
  close(con)

  ids   <- gsub( "\t.*", "", lines)
  desc  <- gsub( "^[^\t]*\t([^\t]*)\t.*", "\\1", lines )
  genes <- gsub( "^[^\t]*\t[^\t]*\t(.*)", "\\1", lines )

  msig$MODULES <- data.frame(
    ID=ids, Title=desc, stringsAsFactors=FALSE)
  if(any(duplicated(msig$MODULES$ID))) {
    warning("Duplicated IDs found; automatic IDs will be generated")
    msig$MODULES$oldID <- msig$MODULES$ID
    msig$MODULES$ID    <- make.unique(as.character(msig$MODULES$ID))
  }

  rownames(msig$MODULES) <- msig$MODULES[,"ID"]

  msig$MODULES2GENES <- strsplit(genes, "\t")
  names(msig$MODULES2GENES) <- ids

  msig$GENES <- data.frame( ID=unique(unlist(msig$MODULES2GENES)))
  msig <- new("tmod", msig)
  msig
}



#' Import data from MSigDB
#'
#' Import data from an MSigDB file in either XML or GMT format
#'
#' This command parses a file from MSigDB. Both XML and the MSigDB-specific
#' "GMT" format are supported (however, the latter is discouraged, as it
#' contains less information).
#' @param file The name of the file to parse
#' @param format Format (either "xml" or "gmt")
#' @param organism Select the organism to use. Use "all" for all organisms in the file (only for "xml" format; default: "Homo sapiens")
#' @param fields Which fields to import to the MODULES data frame (only for "xml" format)
#' @return A tmod object
#' @importFrom XML xmlParse xmlToList
#' @importFrom tibble as_tibble
#' @examples
#' \dontrun{
#' ## First, download the file "msigdb_v7.5.1.xml" 
#' ## from http://www.broadinstitute.org/gsea/downloads.jsp
#' msig <- tmodImportMSigDB("msigdb_v7.5.1.xml")
#' }
#' @export

tmodImportMSigDB <- function( file=NULL, format="xml", organism="Homo sapiens",
  fields=c( "STANDARD_NAME", "CATEGORY_CODE", "SUB_CATEGORY_CODE", "EXACT_SOURCE", "EXTERNAL_DETAILS_URL") ) {

  if(length(file) != 1) stop("Incorrect file parameter")
  if(!file.exists(file)) stop( sprintf("File %s does not exist", file))

  format <- match.arg(format, c( "xml", "gmt"))
  msig <- switch(format,
    xml=.importMsigDBXML(file, fields, organism),
    gmt=.importMsigDBGMT(file))

  s <- msig$gs$Title
  msig$gs$Title <- paste0(toupper(substring(s, 1,1)), tolower(substring(s, 2)) )
  msig$gs$Title <- gsub( "^Gse([0-9])", "GSE\\1", msig$gs$Title )
  msig$gs$Title <- gsub( "_", " ", msig$gs$Title )

  msig$gs$B <- sapply(msig$gs2gv, length)
  msig
}


.tmod2DataFrameRowsModule <- function(mset, module_col, feature_col, sep) {
  ret <- mset$gs
  ret[ , feature_col ] <- sapply(mset$gs2gv, function(x) paste(mset$gv[ x ], collapse=sep))
  colnames(ret)[ colnames(ret) == "ID" ] <- module_col
  ret
}

.tmod2DataFrameRowsFeatures <- function(mset, module_col, feature_col, sep) {
  ret <- tibble(ID=mset$gv)
  colnames(ret) <- feature_col

  gv2gs <- split(rep(mset$gs$ID, lengths(mset$gs2gv)), mset$gv[ unlist(mset$gs2gv) ])
  ret[ , module_col ] <- sapply(gv2gs, function(x) paste(x, collapse=sep))
  ret
}


#' Convert a tmod module set into a data frame
#'
#' Convert a tmod module set into a data frame
#'
#' @param mset a tmod object (e.g. generated by makeTmod)
#' @param rows if "modules", then there will be a row corresponding to each
#' module (gene set); if "features", then there will be a row corresponding to
#' each gene.
#' @param module_col Name of the column with module (gene set) IDs
#' @param feature_col Name of the column with feature (gene) IDs
#' @param sep separator used to collate module IDs (if rows=="features") or feature IDs (if rows=="modules")
#' @seealso \code{\link{makeTmodGS}}, \code{\link{makeTmod}}
#' @export
tmod2DataFrame <- function(mset, rows="modules", module_col="module_id", feature_col="feature_id", sep=",") {
  mset <- .getmodules_gs(NULL, mset)

  rows <- match.arg(rows, c("modules", "features"))

  ret <- switch(rows, 
    modules  = .tmod2DataFrameRowsModule(mset, module_col, feature_col, sep),
    features = .tmod2DataFrameRowsFeatures(mset, module_col, feature_col, sep)
    )

  return(ret)
}



#' Convert a data frame to a tmod object
#'
#' Convert a data frame to a tmod object
#'
#' `makeTmodFromFeatureDataFrame` converts mapping information from features (genes) to modules (gene
#' sets). The data frame has a row for each feature-module pair.
#' 
#' `makeTmodFromModuleDataFrame` converts mapping information from features
#' (genes) to modules (gene sets). The data frame has a row for each module,
#' and all gene IDs corresponding to a module are stored as a comma separated string, e.g.
#'        
#' Vice versa, `tmod2DataFrame` converts a tmod object to a data frame.        


#' @param df A data frame
#' @param feature_col Which column contains the feature (gene) IDs
#' @param module_col Which column contains the module (gene set) IDs
#' @param title_col Description of the modules (if NULL, the description will
#'        be taken from the module_col)
#' @param extra_module_cols Additional columns to include in the module data frame
#' @param extra_gene_cols Additional gene columns to include in the genes data frame
#' @seealso \code{\link{makeTmodGS}}, \code{\link{makeTmodGS}}
#' @return A tmod object
#' @examples
#' df <- data.frame(
#' gene_id=LETTERS[1:10],
#' geneset_id=rep(letters[1:2], each=5),
#' geneset_description=rep(paste0("Gene set ", letters[1:2]), each=5))
#' res <- makeTmodFromDataFrame(df, 
#'   feature_col="gene_id", 
#'   module_col="geneset_id",
#'   title_col="geneset_description")
#' @export
makeTmodFromDataFrame <- function(df, feature_col=1, module_col=2, title_col=NULL, extra_module_cols=NULL, extra_gene_cols=NULL) {
  if(!is.data.frame(df)) stop("df must be a data.frame")
  df <- df[ !is.na(df[, feature_col]) & !is.na(df[, module_col]), ] 
  df <- as.data.frame(df) ## in case it is a tibble

  df_unique <- df[ !duplicated(df[, module_col ]), ]

  mods <- tibble(ID=df_unique[ , module_col ])

  if(is.null(title_col)) {
    title_col <- module_col
  }

  mods[ , "Title"] <- df_unique[ , title_col ]
  if(!is.null(extra_module_cols)) {
    extr <- df_unique[ , extra_module_cols, drop=FALSE ]
    mods <- cbind(mods, extr)
  }

  #m2g <- lapply(mods[ , "ID" ], function(m) df[ df[ , module_col ] == m, feature_col])
  gv <- unique(df[ , feature_col ])
  df[ , feature_col ] <- match(df[ , feature_col ], gv)
  gs2gv <- tapply(df[ , feature_col ], df[ , module_col ], unique, simplify=FALSE)

  ## convert from array list to list
  gs2gv <- lapply(gs2gv, function(.) .)
  mods <- mods[ match(names(gs2gv), mods$ID), ]


  message("making Tmod")
  as_tmodGS(list(gs=mods, gs2gv=gs2gv, gv=gv))
}

Try the tmod package in your browser

Any scripts or data that you put into this service are public.

tmod documentation built on March 31, 2023, 9 p.m.