R/matchGenes.R

Defines functions matchGenes.signedDefault matchGenes.default matchGeneExclNA DGEListSymbols eSetSymbols getSymbols

##----------------------------------------##
## Helper functions
##----------------------------------------##
getSymbols <- function(exprsMatrixRowNames, featureAnno, col="GeneSymbol") {
   if(is.null(featureAnno)) {
     col <- NULL
   } else {
     if (!is.null(col) && !col %in% colnames(featureAnno)) {
       stop("'col' not found in the column names of feature annotation")
     }
   }
  
  if (is.null(col)) {
    symbols <- exprsMatrixRowNames
  } else {
    symbols <- featureAnno[, col]
  }
  symbols <- as.character(symbols)
  return(symbols)
}

#' @importFrom Biobase fData featureNames
eSetSymbols <- function(object, col="GeneSymbol") {
  getSymbols(featureNames(object), fData(object), col=col)
}

#' @importClassesFrom edgeR DGEList
DGEListSymbols <- function(object, col="GeneSymbol") {
  getSymbols(rownames(object$counts), object$genes, col=col)
}

##----------------------------------------##
## matchGenes for GmtList
##----------------------------------------##
matchGeneExclNA <- function(inputGene, allGene) {
    if(is.null(inputGene)) return(NULL)
    res <- match(inputGene, allGene)
    res[is.na(inputGene)] <- NA ## make sure that NA does not match to NA
    return(res)
}

#' @importFrom methods is
matchGenes.default <- function(gmtList,geneSymbols) {
    if(!is(gmtList, "GmtList"))
        stop(paste("gmtlist be must of class GmtList; now it is", class(gmtList)))
    if(!is.character(geneSymbols))
        stop(paste("geneSymbols be must characters; now it is", class(geneSymbols)))
    genes <- lapply(gmtList, function(x) x$genes)
    names(genes) <- sapply(gmtList, function(x) x$name)
    indList <- lapply(genes, matchGeneExclNA, allGene=geneSymbols)
    res <- IndexList(indList)
    return(res)
}
#' Match genes in a list-like object to a vector of genesymbols
#'
#' @param list A GmtList, list, character or SignedGenesets object
#' @param object Gene symbols to be matched; they can come from a vector of character strings, or
#' a column in the fData of an \code{eSet} object.
#' @param ... additional arguments like \code{col}
#' @param col Column name of \code{fData} in an \code{eSet} object, or \code{genes} in an \code{DGEList} object, to specify where gene symbols are stored.
#' The default value is set to "GeneSymbol"
#' 
#' @return An \code{IndexList} object, which is essentially a list of the same length as input (length of \code{1} in case characters are used as input), with matching indices.
#' 
#' @name matchGenes
#' @examples
#' ## test GmtList, character
#' testGenes <- sprintf("gene%d", 1:10)
#' testGeneSets <- GmtList(list(gs1=c("gene1", "gene2"), gs2=c("gene9", "gene10"), gs3=c("gene100")))
#' matchGenes(testGeneSets, testGenes)
#' 
#' ## test GmtList, matrix
#' testGenes <- sprintf("gene%d", 1:10)
#' testGeneSets <- GmtList(list(gs1=c("gene1", "gene2"), gs2=c("gene9", "gene10"), gs3=c("gene100")))
#' testGeneExprs <- matrix(rnorm(100), nrow=10, dimnames=list(testGenes, sprintf("sample%d", 1:10)))
#' matchGenes(testGeneSets, testGeneExprs)
#' 
#' ## test GmtList, eSet
#' testGenes <- sprintf("gene%d", 1:10)
#' testGeneSets <- GmtList(list(gs1=c("gene1", "gene2"), gs2=c("gene9", "gene10"), gs3=c("gene100")))
#' testGeneExprs <- matrix(rnorm(100), nrow=10, dimnames=list(testGenes, sprintf("sample%d", 1:10)))
#' testFeat <- data.frame(GeneSymbol=rownames(testGeneExprs), row.names=testGenes)
#' testPheno <- data.frame(SampleId=colnames(testGeneExprs), row.names=colnames(testGeneExprs))
#' testEset <- ExpressionSet(assayData=testGeneExprs,
#'     featureData=AnnotatedDataFrame(testFeat),
#'     phenoData=AnnotatedDataFrame(testPheno))
#' matchGenes(testGeneSets, testGeneExprs)
#' ## force using row names
#' matchGenes(testGeneSets, testEset, col=NULL)
#' 
#'  ## test GmtList, DGEList
#'  if(requireNamespace("edgeR")) {
#'     mat <- matrix(rnbinom(100, mu=5, size=2), ncol=10)
#'     rownames(mat) <- sprintf("gene%d", 1:nrow(mat))
#'     y <- edgeR::DGEList(counts=mat, group=rep(1:2, each=5))
#'
#'     ## if genes are not set, row names of the count matrix will be used for lookup
#'     myGeneSet <- GmtList(list(gs1=rownames(mat)[1:2], gs2=rownames(mat)[9:10], gs3="gene100"))
#'     matchGenes(myGeneSet, y)
#'
#'     matchGenes(c("gene1", "gene2"), y)
#'     ## alternatively, use 'col' parameter to specify the column in 'genes'
#'     y2 <- edgeR::DGEList(counts=mat,
#'       group=rep(1:2, each=5),
#'       genes=data.frame(GeneIdentifier=rownames(mat), row.names=rownames(mat)))
#'     matchGenes(myGeneSet, y2, col="GeneIdentifier")
#'  }
#' 
#' ## test character, character
#' matchGenes(c("gene1", "gene2"), testGenes)
#' 
#' ## test character, matrix
#' matchGenes(c("gene1", "gene2"), testGeneExprs)
#' 
#' ## test character, eset
#' matchGenes(c("gene1", "gene2"), testEset)
NULL

#'@rdname matchGenes
#'@aliases matchGenes,GmtList,character-method
setMethod("matchGenes", c("GmtList", "character"), function(list, object) {
              matchGenes.default(list, object)
          })

#'@rdname matchGenes
#'@aliases matchGenes,GmtList,matrix-method
setMethod("matchGenes", c("GmtList", "matrix"), function(list, object) {
              if(is.null(rownames(object)))
                  stop("When used to map genes in GmtList directly to rows in matrix, the matrix's row names must be gene symbols")
              symbols <- rownames(object)
              matchGenes.default(list, as.character(symbols))
          })


#'@rdname matchGenes
#'@aliases matchGenes,GmtList,eSet-method
setMethod("matchGenes", c("GmtList", "eSet"), function(list, object, col="GeneSymbol") {
              symbols <- eSetSymbols(object, col=col)
              matchGenes.default(list, as.character(symbols))
          })

#'@rdname matchGenes
#'@aliases matchGenes,character,character-method
setMethod("matchGenes", c("character", "character"), function(list, object) {
              tempList <- GmtList(list(TempGeneSet=list))
              matchGenes.default(tempList, object)
          })
#'@rdname matchGenes
#'@aliases matchGenes,character,matrix-method
setMethod("matchGenes", c("character", "matrix"), function(list, object) {
              tempList <- GmtList(list(TempGeneSet=list))
              matchGenes(tempList, object)
          })
#'@rdname matchGenes
#'@aliases matchGenes,character,eSet-method
setMethod("matchGenes", c("character", "eSet"), function(list, object) {
              tempList <- GmtList(list(TempGeneSet=list))
              matchGenes(tempList, object)
          })

#'@rdname matchGenes
#'@aliases matchGenes,character,DGEList-method
setMethod("matchGenes", c("character", "DGEList"), function(list, object, col="GeneSymbol") {
    tempList <- GmtList(list(TempGeneSet=list))
    matchGenes(tempList, object)
})

#'@rdname matchGenes
#'@aliases matchGenes,GmtList,DGEList-method
setMethod("matchGenes", c("GmtList", "DGEList"), function(list, object, col="GeneSymbol") {
  symbols <- DGEListSymbols(object, col=col)
  matchGenes.default(list, symbols)
})

##----------------------------------------##
## matchGenes for SignedGenesets
##----------------------------------------##
#' @importFrom methods is
matchGenes.signedDefault <- function(signedGenesets, geneSymbols) {
    if(!is(signedGenesets, "SignedGenesets"))
        stop(paste("signedGenesets be must of class SignedGenesets; now it is", class(signedGenesets)))
    resList <- lapply(signedGenesets, function(geneset) {
        pos <- geneset$pos
        neg <- geneset$neg
        posInd <- matchGeneExclNA(geneset$pos, geneSymbols)
        negInd <- matchGeneExclNA(geneset$neg, geneSymbols)
        return(list(pos=posInd, neg=negInd))
    })
    names(resList) <- sapply(signedGenesets, function(x) x$name)
    res <- SignedIndexList(resList)
    return(res)
}

#'@rdname matchGenes
#'@aliases matchGenes,SignedGenesets,character-method
setMethod("matchGenes", c("SignedGenesets", "character"), function(list, object) {
              matchGenes.signedDefault(list, object)
          })

#'@rdname matchGenes
#'@aliases matchGenes,SignedGenesets,matrix-method
setMethod("matchGenes", c("SignedGenesets", "matrix"), function(list, object) {
              if(is.null(rownames(object)))
                  stop("When used to map genes in GmtList directly to rows in matrix, the matrix's row names must be gene symbols")
              symbols <- rownames(object)
              matchGenes.signedDefault(list, as.character(symbols))
          })

#'@rdname matchGenes
#'@aliases matchGenes,SignedGenesets,eSet-method
setMethod("matchGenes", c("SignedGenesets", "eSet"), function(list, object, col="GeneSymbol") {
  symbols <- eSetSymbols(object, col=col)
  matchGenes.signedDefault(list, symbols)
})

#'@rdname matchGenes
#'@aliases matchGenes,SignedGenesets,DGEList-method
setMethod("matchGenes", c("SignedGenesets", "DGEList"), function(list, object, col="GeneSymbol") {
  symbols <- DGEListSymbols(object, col=col)
  matchGenes.default(list, symbols)
})
Accio/BioQC documentation built on Jan. 27, 2022, 10:45 p.m.