R/utilities.R
In signatureSearch: Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

Documented in append2H5 create_empty_h5 gctx2h5 getDEGSig getSig getSPsubSig gmt2h5 matrix2h5 read_gmt set_readable tarReduce vec_char_redu

sep_pcf <- function(res){
    new <- as.data.frame(t(vapply(seq_len(nrow(res)), function(i)
        unlist(strsplit(as.character(res$set[i]), "__")), 
        FUN.VALUE=character(3))), stringsAsFactors=FALSE)
    colnames(new) <- c("pert", "cell", "type")
    res <- cbind(new, res[,-1])
    return(res)
}

# readHDF5mat <- function(h5file, colindex=seq_len(10)) {
#     m <- h5read(h5file, "assay", index=list(NULL, colindex))
#     mycol <- h5read(h5file, "colnames", index=list(colindex, 1))
#     myrow <- h5read(h5file, "rownames")
#     h5closeAll()
#     rownames(m) <- as.character(myrow[,1])
#     colnames(m) <- as.character(mycol[,1])
#     return(m)
# }

# getH5dim <- function(h5file){
#     mat_dim <- h5ls(h5file)$dim[1]
#     mat_ncol <- as.numeric(gsub(".* x ","", mat_dim))
#     mat_nrow <- as.numeric(gsub(" x .*","", mat_dim))
#     return(c(mat_nrow, mat_ncol))
# }

#' Write Matrix to HDF5 file
#' 
#' Function writes matrix object to an HDF5 file.
#' @param matrix matrix to be written to HDF5 file, row and column name slots 
#' need to be populated
#' @param h5file character(1), path to the hdf5 destination file
#' @param name The name of the dataset in the HDF5 file. The default is write
#' the score matrix (e.g. z-score, logFC) to the 'assay' dataset, users could 
#' also write the adjusted p-value or FDR matrix to the 'padj' dataset by 
#' setting the \code{name} as 'padj'.
#' @param overwrite TRUE or FALSE, whether to overwrite or append
#' matrix to an existing 'h5file'
#' @return HDF5 file containing exported matrix
#' @examples 
#' mat <- matrix(rnorm(12), nrow=3, dimnames=list(
#'               paste0("r",1:3), paste0("c",1:4)))
#' h5file <- tempfile(fileext=".h5")
#' matrix2h5(matrix=mat, h5file=h5file, overwrite=TRUE)
#' @export
matrix2h5 <- function(matrix, h5file, name="assay", overwrite=TRUE){
    create_empty_h5(h5file, delete_existing=overwrite)
    append2H5(matrix, h5file, name=name)
}

#' Create Empty HDF5 File 
#' 
#' This function can be used to create an empty HDF5 file where the user defines
#' the file path and compression level. The empty HDF5 file has under its root
#' group three data slots named 'assay', 'colnames' and 'rownames' for storing a
#' \code{numeric matrix} along with its column names (\code{character}) and row 
#' names (\code{character}), respectively.
#' 
#' @param h5file character(1), path to the HDF5 file to be created
#' @param delete_existing logical, whether to delete an existing HDF5 file with 
#' identical path
#' @param level The compression level used, here given as integer value between 
#' 0 (no compression) and 9 (highest and slowest compression).
#' @return empty HDF5 file
#' @examples
#' tmp_file <- tempfile(fileext=".h5")
#' create_empty_h5(tmp_file, level=6)
#' @export
create_empty_h5 <- function(h5file, delete_existing=FALSE, level=6) {
    if(delete_existing) unlink(h5file)
    if(! file.exists(h5file)){
        h5createFile(file=h5file)
        h5createDataset(h5file, "assay", c(0,0), c(H5Sunlimited(), H5Sunlimited()), 
                        chunk=c(12328,1), level=level)
        h5createDataset(h5file, "padj", c(0,0), c(H5Sunlimited(), H5Sunlimited()), 
                        chunk=c(12328,1), level=level)
        h5createDataset(h5file, "colnames", c(0,1), c(H5Sunlimited(), 1), 
                        storage.mode='character', size=200, level=level)
        h5createDataset(h5file, "rownames", c(0,1), c(H5Sunlimited(), 1), 
                        storage.mode='character', size=40, level=level)
    }
    h5closeAll()
}

#' Append Matrix to HDF5 File
#' 
#' Function to write matrix data to an existing HDF5 file. If the file contains 
#' already matrix data then both need to have the same number of rows. The 
#' append will be column-wise.
#' @param x matrix object to write to an HDF5 file. If the HDF5 file is not 
#' empty, the exported matrix data needs to have the same number rows as the 
#' matrix stored in the HDF5 file, and will be appended column-wise to the 
#' existing one.
#' @param h5file character(1), path to existing HDF5 file that can be empty or
#' contain matrix data
#' @param name The name of the dataset in the HDF5 file.
#' @param printstatus logical, whether to print status
#' @return HDF5 file storing exported matrix
#' @examples 
#' mat <- matrix(1:12, nrow=3)
#' rownames(mat) <- paste0("r", 1:3); colnames(mat) <- paste0("c", 1:4)
#' tmp_file <- tempfile(fileext=".h5")
#' create_empty_h5(tmp_file)
#' append2H5(mat, tmp_file)
#' rhdf5::h5ls(tmp_file)
#' @export
append2H5 <- function(x, h5file, name="assay", printstatus=TRUE) {
    status <- h5ls(h5file)[c("name", "dim")]
    rowstatus <- as.numeric(gsub(" x \\d{1,}$", "", 
                                 status[status$name==name, "dim"]))
    colstatus <- as.numeric(gsub("^\\d{1,} x ", "", 
                                 status[status$name==name, "dim"]))
    nrows <- nrow(x) 
    ncols <- colstatus + ncol(x)
    h5set_extent(h5file, name, c(nrows, ncols))
    h5write(x, h5file, name, index=list(seq_len(nrows), (colstatus+1):ncols))
    # only stores rownames and colnames of 'assay' dataset
    if(name=="assay"){
        h5set_extent(h5file, "colnames", c(ncols,1))
        h5write(colnames(x), h5file, "colnames", index=list((colstatus+1):ncols, 1))
        if(any(duplicated(h5read(h5file, "colnames")[,1]))) 
            warning("Column names contain duplicates!")
        
        h5set_extent(h5file, "rownames", c(nrows,1))
        h5write(rownames(x), h5file, "rownames", index=list(seq_len(nrows), 1))
        if(any(duplicated(h5read(h5file, "rownames")[,1]))) 
            warning("Row names contain duplicates!")
    }
    
    if(printstatus) h5ls(h5file, all=TRUE)[c("dim", "maxdim")]
    h5closeAll()
}

#' Read matrix-like data from large gctx file in chunks and write result back 
#' to an HDF5 file.
#' @title Convert GCTX to HDF5 File
#' @param gctx character(1), path to gctx file from LINCS
#' @param cid character or integer vector referencing the
#' columns of the matrix to include
#' @param new_cid character vector of the same length as cid, assigning new
#' column names to matrix
#' @param h5file character(1), path of the hdf5 destination file
#' @param by_ncol number of columns to import in each iteration to limit 
#' memory usage
#' @param overwrite TRUE or FALSE, whether to overwrite or to append to 
#' existing 'h5file'
#' @return HDF5 file
#' @import rhdf5
#' @examples 
#' gctx <- system.file("extdata", "test_sample_n2x12328.gctx", 
#'         package="signatureSearch")
#' h5file <- tempfile(fileext=".h5")
#' gctx2h5(gctx, cid=1:2, 
#'         new_cid=c('sirolimus__MCF7__trt_cp', 'vorinostat__SKB__trt_cp'), 
#'         h5file=h5file, overwrite=TRUE)
#' @export
#' 
gctx2h5 <- function(gctx, cid, new_cid=cid, h5file, by_ncol=5000, 
                    overwrite=TRUE){
    cid_list <- suppressWarnings(
        split(cid, rep(seq_len(ceiling(length(cid)/by_ncol)), 
                       each=by_ncol)))
    new_cid_list <- suppressWarnings(
        split(new_cid, rep(seq_len(ceiling(length(new_cid)/by_ncol)), 
                           each=by_ncol)))
    create_empty_h5(h5file, delete_existing=overwrite)
    lapply(seq_along(cid_list), function(i){
        mat <- parse_gctx(gctx, cid=cid_list[[i]], matrix_only=TRUE)
        mat <- mat@mat
        colnames(mat) <- new_cid_list[[i]]
        append2H5(x=mat, h5file, printstatus=FALSE)
    })    
    h5ls(h5file)
}

#' Read gene sets from large gmt file in batches, convert the gene sets to
#' 01 matrix and write the result to an HDF5 file.
#' @title Convert GMT to HDF5 File
#' @param gmtfile character(1), path to gmt file containing gene sets
#' @param dest_h5 character(1), path of the hdf5 destination file
#' @param by_nset number of gene sets to import in each iteration to limit 
#' memory usage
#' @param overwrite TRUE or FALSE, whether to overwrite or to append to 
#' existing 'h5file'
#' @return HDF5 file
#' @examples 
#' gmt <- system.file("extdata", "test_gene_sets_n4.gmt", 
#'         package="signatureSearch")
#' h5file <- tempfile(fileext=".h5")
#' gmt2h5(gmtfile=gmt, dest_h5=h5file, overwrite=TRUE)
#' @export
gmt2h5 <- function(gmtfile, dest_h5, by_nset=5000, overwrite=FALSE){
    # get number of lines of gmtfile
    wc <- system(paste("wc -l", gmtfile), intern = TRUE)
    nline <- as.numeric(gsub(pattern=gmtfile, "", wc,fixed = TRUE))
    ceil <- ceiling(nline/by_nset)
    
    if(file.exists(dest_h5) & !overwrite){
        message(paste(dest_h5, "already exists!"))
    } else {
        create_empty_h5(dest_h5, delete_existing=TRUE)
    }
    
    # get all gene identifiers in gmtfile
    all_genes <- NULL
    for(i in seq_len(ceil)){
        gene_sets <- suppressWarnings(read_gmt(gmtfile, 
                            start=by_nset*(i-1)+1, end=by_nset*i))
        tmp <- unique(unlist(gene_sets))
        all_genes <- unique(c(all_genes, tmp))
    }
            
    # read in gene sets in batches
    for(i in seq_len(ceil)){
        gene_sets <- suppressWarnings(read_gmt(gmtfile, 
                        start=by_nset*(i-1)+1, end=by_nset*i))
        # transform gene sets to sparseMatrix
        mat <- gs2mat(gene_sets)
        miss_genes <- setdiff(all_genes, rownames(mat))
        patch <- matrix(0, nrow=length(miss_genes), ncol=ncol(mat))
        rownames(patch) <- miss_genes
        colnames(patch) <- colnames(mat)
        mat <- rbind(mat, patch)
        append2H5(x=mat[all_genes,], dest_h5, printstatus=FALSE)
    }
    h5ls(dest_h5)
}

gs2mat <- function(gene_sets){
    gsc <- GeneSetCollection(mapply(function(geneIds, setId) {
        GeneSet(geneIds, geneIdType=EntrezIdentifier(),
                setName=setId)
    }, gene_sets, names(gene_sets)))
    mat <- incidence(gsc)
    mat <- Matrix::t(mat)
    return(mat)
}

# readHDF5chunk <- function(h5file, colindex=seq_len(10), colnames=NULL) {
#     if(! is.null(colnames)){
#         all_trts <- h5read(h5file, "colnames", drop=TRUE)
#         colindex2 <- which(all_trts %in% colnames)
#         m <- h5read(h5file, "assay", index=list(NULL, colindex2))
#         colindex <- colindex2
#     } else {
#         m <- h5read(h5file, "assay", index=list(NULL, colindex))
#     }
#     mycol <- h5read(h5file, "colnames", index=list(colindex, 1))
#     myrow <- h5read(h5file, "rownames")
#     rownames(m) <- as.character(myrow[,1])
#     colnames(m) <- as.character(mycol[,1])
#     if(! is.null(colnames)){
#         m <- m[,colnames, drop=FALSE]
#     }
#     se <- SummarizedExperiment(assays=list(score=m))
#     h5closeAll()
#     return(se)
# }

#' @importFrom ExperimentHub ExperimentHub

determine_refdb <- function(refdb){
    eh <- suppressMessages(ExperimentHub())
    if(refdb=="cmap"){
        return(eh[["EH3223"]])
    }
    if(refdb=="cmap_expr"){
        return(eh[["EH3224"]])
    }
    if(refdb=="lincs"){
        return(eh[["EH3226"]])
    }
    if(refdb=="lincs_expr"){
        return(eh[["EH3227"]])
    } else {
        return(refdb)
    }
}

load_sqlite <- function(eh_id){
    eh <- suppressMessages(ExperimentHub())
    path <- suppressMessages(eh[[eh_id]])
    conn <- dbConnect(SQLite(), path)
    return(conn)
}

select_ont <- function(res, ont, GO_DATA){
    # Add and select ontology in res
    res <- add_GO_Ontology(res, GO_DATA)
    tmp_df <- result(res)
    colnames(tmp_df)[1] = "ont"
    tmp_df$ont = as.character(tmp_df$ont)
    result(res) <- tmp_df
    if(ont != "ALL")
        result(res) <- as_tibble(res[res$ont == ont, ])
    return(res)
}

#' Functionalities used to draw from reference database 
#' (e.g. lincs, lincs_expr) GESs of compound treatment(s) in cell types.
#' 
#' The GES could be genome-wide differential expression profiles (e.g. log2 
#' fold changes or z-scores) or normalized gene expression intensity values 
#' depending on the data type of \code{refdb} or n top up/down regulated DEGs
#' @title Draw GESs from Reference Database 
#' @rdname getSig
#' @param cmp character vector representing a list of compound name available 
#' in \code{refdb} for \code{getSig} function, or character(1) indicating a
#' compound name (e.g. vorinostat) for other functions
#' @param cell character(1) or character vector of the same length as cmp 
#' argument. It indicates cell type that the compound treated in
#' @param refdb character(1), one of "lincs", "lincs_expr", "cmap", "cmap_expr",
#' or path to the HDF5 file built from \code{\link{build_custom_db}} function
#' @return matrix representing genome-wide GES of the query compound(s) in cell
#' @examples 
#' refdb <- system.file("extdata", "sample_db.h5", package = "signatureSearch")
#' vor_sig <- getSig("vorinostat", "SKB", refdb=refdb)
#' @export
#' 
getSig <- function(cmp, cell, refdb){
    if(is.character(refdb)){
        refdb <- determine_refdb(refdb)
        refse <- SummarizedExperiment(HDF5Array(refdb, name="assay"))
        rownames(refse) <- HDF5Array(refdb, name="rownames")
        colnames(refse) <- HDF5Array(refdb, name="colnames")
        trt <- paste(cmp, cell, "trt_cp", sep="__")
        trt2 <- intersect(trt, colnames(refse))
        notin <- setdiff(trt, colnames(refse))
        if(length(notin) > 0){
            warning(length(notin), "/", length(trt), 
                    " teatments are not contained in refdb, ", 
                    "they are ignored!")
        }
        cmp_mat <- as.matrix(assay(refse[,trt2]))
        # sort decreasingly
        #cmp_mat2 <- apply(cmp_mat, 2, sort, decreasing=TRUE)
        return(cmp_mat)
    } else {
        message(paste("Please set refdb as one of",  
                "'lincs', 'lincs_expr', 'cmap' or 'cmap_expr', "),
                "or path to an HDF5 file representing reference database!")
    }
}

#' @rdname getSig
#' @param Nup integer(1). Number of most up-regulated genes to be subsetted
#' @param Ndown integer(1). Number of most down-regulated genes to be subsetted
#' @param higher numeric(1), the upper threshold of defining DEGs. 
#' At least one of 'lower' and 'higher' must be specified.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @param lower numeric(1), the lower threshold of defining DEGs. 
#' At least one of 'lower' and 'higher' must be specified.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @param padj numeric(1), cutoff of adjusted p-value or false discovery rate (FDR)
#' of defining DEGs if the reference HDF5 database contains the p-value matrix 
#' stored in the dataset named as 'padj'.
#' If \code{Nup} or \code{Ndown} arguments are defined, it will be ignored. 
#' @return a list of up- and down-regulated gene label sets 
#' @examples 
#' vor_degsig <- getDEGSig(cmp="vorinostat", cell="SKB", Nup=150, Ndown=150,
#'                         refdb=refdb)
#' @export
getDEGSig <- function(cmp, cell, Nup=NULL, Ndown=NULL, 
                      higher=NULL, lower=NULL, padj=NULL, refdb="lincs"){
    deprof <- suppressMessages(getSig(cmp, cell, refdb))
    deprof_sort <- apply(deprof, 2, sort, decreasing=TRUE)
    if(!is.null(Nup) & is.null(Ndown)){
        Ndown = 0
    }
    if(is.null(Nup) & !is.null(Ndown)){
        Nup = 0
    }
    if(! is.null(Nup) & ! is.null(Ndown)){
        upset <- head(rownames(deprof_sort), Nup)
        downset <- tail(rownames(deprof_sort), Ndown)
        return(list(upset=upset, downset=downset))
    } 
    if(!is.null(higher) & is.null(lower)){
        up <- rownames(deprof_sort[deprof_sort >= higher,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            up <- up[up %in% pdegs]
        }
        return(list(upset=up, downset=character()))
    }
    if(is.null(higher) & !is.null(lower)){
        down <- rownames(deprof_sort[deprof_sort <= lower,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            down <- down[down %in% pdegs]
        }
        return(list(upset=character(), downset=down))
    }
    if(!is.null(higher) & !is.null(lower)){
        up <- rownames(deprof_sort[deprof_sort >= higher,,drop=FALSE])
        down <- rownames(deprof_sort[deprof_sort <= lower,,drop=FALSE])
        if(!is.null(padj)){
            pdegs <- getPCut(cmp, cell, refdb, padj)
            up <- up[up %in% pdegs]
            down <- down[down %in% pdegs]
        }
        return(list(upset=up, downset=down))
    }
    stop("You need to either set 'Nup', 'Ndown' as integers or 
    set 'higher', 'lower', 'padj' as numeric values")
}

getPCut <- function(cmp, cell, refdb, padj){
    db_path <- determine_refdb(refdb)
    pmat <- HDF5Array(db_path, name="padj")
    rownames(pmat) <- as.character(HDF5Array(db_path, name="rownames"))
    colnames(pmat) <- as.character(HDF5Array(db_path, name="colnames"))
    pval <- as.matrix(pmat[, paste(cmp, cell, "trt_cp", sep="__"), drop=FALSE])
    degs <- rownames(pval[pval <= padj,,drop=FALSE])
    return(degs)
}
#' @rdname getSig
#' @return a numeric matrix with one column representing gene expression values
#' drawn from \code{lincs_expr} db of the most up- and down-regulated genes. 
#' The genes were subsetted according to z-scores drawn from \code{lincs} db. 
#' @examples 
#' all_expr <- as.matrix(runif(1000, 0, 10), ncol=1)
#' rownames(all_expr) <- paste0('g', sprintf("%04d", 1:1000))
#' colnames(all_expr) <- "drug__cell__trt_cp"
#' de_prof <- as.matrix(rnorm(1000, 0, 3), ncol=1)
#' rownames(de_prof) <- paste0('g', sprintf("%04d", 1:1000))
#' colnames(de_prof) <- "drug__cell__trt_cp"
#' ## getSPsubSig internally uses deprof2subexpr function
#' ## sub_expr <- deprof2subexpr(all_expr, de_prof, Nup=150, Ndown=150)
#' @export
getSPsubSig <- function(cmp, cell, Nup=150, Ndown=150){
    query_expr <- suppressMessages(getSig(cmp, cell, refdb="lincs_expr"))
    query_prof <- suppressMessages(getSig(cmp, cell, refdb="lincs"))
    sub_expr <- deprof2subexpr(query_expr, query_prof, Nup=Nup, Ndown=Ndown)
    return(sub_expr)
}

deprof2subexpr <- function(all_expr, de_prof, Nup=150, Ndown=150){
    de_prof_sort <- apply(de_prof, 2, sort, decreasing=TRUE)
    upset <- head(rownames(de_prof_sort), Nup)
    downset <- tail(rownames(de_prof_sort), Ndown)
    sub_expr <- all_expr[c(upset, downset), , drop=FALSE]
    return(sub_expr)
}
    
trts_check <- function(ref_trts, full_trts){
    trts_valid <- intersect(ref_trts, full_trts)
    inval <- setdiff(ref_trts, full_trts)
    if(length(inval)>0){
        message(length(inval), 
        " treatments in ref_trts are not available in refdb, they are ignored!")
    }
    if(length(trts_valid)==0){
        stop("No ref_trts are available in refdb, ", 
             "the refdb is subsetted to empty!")
    }
    return(trts_valid)
}

#' Reduce number of characters for each element of a character vector by 
#' replacting the part that beyond Nchar (e.g. 50) character to '...'.  
#' @title Reduce Number of Character 
#' @param vec character vector to be reduced
#' @param Nchar integer, for each element in the vec, number of characters to 
#' remain
#' @return character vector after reducing
#' @examples 
#' vec <- c(strrep('a', 60), strrep('b', 30))
#' vec2 <- vec_char_redu(vec, Nchar=50)
#' @export
vec_char_redu <- function(vec, Nchar=50){
    vec <- as.character(vec)
    res <- vapply(vec, function(s){
        if(nchar(s) > Nchar){
            s2 <- substr(s, 1, 50)
            paste0(s2, "...")
        } else s
    }, FUN.VALUE=character(1))
    return(res)
}

#' Show Reduced Targets
#' 
#' Reduce number of targets for each element of a character vector by 
#' replacting the targets that beyond Ntar to '...'.  
#' @param vec character vector, each element composed by a list of targets 
#' symbols separated by '; '
#' @param Ntar integer, for each element in the vec, number of targets to show
#' @return character vector after reducing
#' @examples 
#' vec <- c("t1; t2; t3; t4; t5; t6", "t7; t8")
#' vec2 <- tarReduce(vec, Ntar=5)
#' @export
#' 
tarReduce <- function(vec, Ntar=5){
    singleTarShot <- function(s, Ntar){
        temp <- strsplit(s, ';\\s?')[[1]]
        if(length(temp)>Ntar){
            return(paste0(paste(temp[seq_len(Ntar)], collapse = "; "), "; ..."))
        } else {
            return(s)
        }
    }
    return(sapply(vec, singleTarShot, Ntar, USE.NAMES = FALSE))
}

load_OrgDb <- function(OrgDb){
    if(is(OrgDb, "character")){
        if(! require(OrgDb, character.only = TRUE)){
            stop(paste(OrgDb, "package need to be installed to use this function"))
        }
        OrgDb <- eval(parse(text = OrgDb))
    }
    return(OrgDb)
}

#' Read in gene set information from .gmt files
#' 
#' This function reads in and parses information from the MSigDB's .gmt files. 
#' Pathway information will be returned as a list of gene sets.
#' 
#' The .gmt format is a tab-delimited list of gene sets, where each line is a 
#' separate gene set. The first column must specify the name of the gene set, 
#' and the second column is used for a short description (which this function 
#' discards). For complete details on the .gmt format, refer to the Broad 
#' Institute's Data Format's page 
#' \url{http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}.
#' 
#' @param file The .gmt file to be read
#' @param start integer(1), read the gmt file from start line
#' @param end integer(1), read the gmt file to the end line, the default -1 
#' means read to the end 
#' @return A list, where each index represents a separate gene set.
#' @section Warning:
#' The function does not check that the file is correctly formatted, and may 
#' return incorrect or partial gene sets, e.g. if the first two columns are 
#' omitted. Please make sure that files are correctly formatted before reading 
#' them in using this function.
#' @examples 
#' library(signatureSearch)
#' # geneSets <- read_gmt("path/to/the/gmt/file")
#' @export
#' 
read_gmt <- function(file, start=1, end=-1){
    if (!grepl("\\.gmt$", file)[1]) {
        stop("Pathway information must be a .gmt file")
    }
    geneSetDB = scan(file, what="character", n=end-start+1, skip=start-1, 
                     sep="\n", quiet=TRUE)
    geneSetDB = suppressWarnings(strsplit(geneSetDB, "\t"))
    names(geneSetDB) = sapply(geneSetDB, "[", 1)
    geneSetDB = lapply(geneSetDB, "[", -1:-2)
    geneSetDB = lapply(geneSetDB, function(x) {
        x[which(x != "")]
    })
    geneSetDB <- geneSetDB[sapply(geneSetDB, length)>0 & !is.na(names(geneSetDB))]
    return(geneSetDB)
}

##' Mapping 'itemID' column in the FEA enrichment result table from Entrez ID
##' to gene Symbol
##'
##' @title Set Readable
##' @param tb tibble object, enrichment result table
##' @param OrgDb character(1), 'org.Hs.eg.db' for human
##' @param keyType character(1), keyType of gene
##' @param geneCol character(1), name of the column in 'tb' containing gene
##' Entrez ids separated by '/' to be converted to gene Symbol
##' @return tibble Object
##' @importFrom stats setNames
##' @importFrom AnnotationDbi columns
##' @examples 
##' data(drugs10)
##' res <- tsea_dup_hyperG(drugs=drugs10, type="Reactome", pvalueCutoff=1, 
##'                        qvalueCutoff=1)
##' res_tb <- set_readable(result(res))
##' @export
set_readable <- function(tb, OrgDb="org.Hs.eg.db", keyType="ENTREZID", geneCol="itemID"){
    OrgDb <- load_OrgDb(OrgDb)
    if (!'SYMBOL' %in% columns(OrgDb)) {
        warning("Fail to convert input geneID to SYMBOL since no SYMBOL information available in the provided OrgDb...")
    }
    
    if(! geneCol %in% colnames(tb)){
        stop("The column name defined under 'geneCol' must be contained in tb!")
    }
    gc <- setNames(strsplit(tb[[geneCol]], "/", fixed=TRUE), tb$ID)
    genes <- unique(unlist(gc))
    gn <- EXTID2NAME(OrgDb, genes, keyType)
    gc <- lapply(gc, function(i) gn[i])
    geneID <- sapply(gc, paste0, collapse="/")
    
    # gc_sym <- sapply(gc, function(genes){
    #     gn_df <- suppressMessages(select(OrgDb, keys=genes, keytype=keyType, 
    #                                      columns="SYMBOL"))
    #     gn_sym <- paste0(na.omit(gn_df$SYMBOL), collapse="/")
    #     return(gn_sym)})
    
    tb[[geneCol]] <- geneID
    return(tb)
}
Any scripts or data that you put into this service are public.
signatureSearch documentation built on April 16, 2021, 6 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
signatureSearch
Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

R/utilities.R
In signatureSearch: Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

Defines functions set_readable read_gmt load_OrgDb tarReduce vec_char_redu trts_check deprof2subexpr getSPsubSig getPCut getDEGSig getSig select_ont load_sqlite determine_refdb gs2mat gmt2h5 gctx2h5 append2H5 create_empty_h5 matrix2h5 sep_pcf

Documented in append2H5 create_empty_h5 gctx2h5 getDEGSig getSig getSPsubSig gmt2h5 matrix2h5 read_gmt set_readable tarReduce vec_char_redu

Try the signatureSearch package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

signatureSearch Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

R/utilities.R In signatureSearch: Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

Defines functions set_readable read_gmt load_OrgDb tarReduce vec_char_redu trts_check deprof2subexpr getSPsubSig getPCut getDEGSig getSig select_ont load_sqlite determine_refdb gs2mat gmt2h5 gctx2h5 append2H5 create_empty_h5 matrix2h5 sep_pcf

Documented in append2H5 create_empty_h5 gctx2h5 getDEGSig getSig getSPsubSig gmt2h5 matrix2h5 read_gmt set_readable tarReduce vec_char_redu

Try the signatureSearch package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

signatureSearch
Environment for Gene Expression Searching Combined with Functional Enrichment Analysis

R/utilities.R
In signatureSearch: Environment for Gene Expression Searching Combined with Functional Enrichment Analysis