R/gep2pep.R

#' gep2pep: creation and analysis of Pathway Expression Profiles
#'
#' Pathway Expression Profiles (PEPs) are based on the expression of
#' pathways (or generic gene sets) belonging to a collection, as
#' opposed to individual genes. \code{gep2pep} supports the conversion
#' of gene expression profiles (GEPs) to PEPs and performs enrichment
#' analysis of both pathways and conditions.
#'
#' @details
#'
#' \code{gep2pep} creates a local repository of gene sets, which can
#' also be imported from the MSigDB [1] database. The local repository
#' is in the \code{repo} format. When a GEP, defined as a ranked list
#' of genes, is passed to \code{\link{buildPEPs}}, the stored database
#' of pathways is used to convert the GEP to a PEP and permanently
#' store the latter.
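#'
#' A minimal sketch of this basic workflow, using the small sample
#' data shipped with the package (the same steps appear in the
#' examples of the individual functions):
#'
#' \preformatted{
#' db <- loadSamplePWS()                    # sample pathway collections
#' rp <- createRepository(
#'     file.path(tempdir(), "gep2pepTemp"), db)
#' geps <- loadSampleGEP()                  # sample GEPs (matrix of ranks)
#' buildPEPs(rp, geps)                      # convert GEPs to PEPs and store them
#' }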
#'
#' One type of analysis that can be performed on PEPs and that is
#' directly supported by \code{gep2pep} is the Drug-Set Enrichment
#' Analysis (DSEA [2]). It finds pathways that are consistently
#' dysregulated by a set of drugs, as opposed to a background of other
#' drugs. Of course PEPs may refer to non-pharmacological conditions
#' (genetic perturbations, disease states, cell types, etc.) for
#' analogous analyses. See \code{\link{CondSEA}} function.
#'
#' A complementary approach is that of finding conditions that
#' consistently dysregulate a set of pathways. This is the
#' pathway-based version of the Gene Set Enrichment Analysis
#' (GSEA). As an application example, this approach can be used to
#' find drugs mimicking the dysregulation of a gene by looking for
#' drugs dysregulating pathways involving the gene (this has been
#' published as the \code{gene2drug} tool [3]). See
#' \code{\link{PathSEA}}.
#'
#' Both DSEA and gene2drug analyses can be performed using
#' preprocessed data from
#' \url{http://dsea.tigem.it/downloads.php}. The data include
#' Connectivity Map [4] GEPs (drug-induced gene expression profiles)
#' converted to PEPs in the form of a \code{gep2pep} repository.
#' 
#' Naming conventions:
#'
#' \itemize{
#'
#'   \item{pathway: }{any set of gene identifiers (not necessarily
#'   representing a molecular pathway).}
#'
#'   \item{pathway collection: }{a set of pathways.}
#'
#'   \item{pathway database: }{a set of pathway collections, like the
#'   MSigDB.}
#'
#'   \item{Gene Expression Profile (GEP): }{a named vector where names
#'   are gene identifiers of the same type as those in the pathway
#'   database and elements are ranks ranging from 1 to the number of
#'   genes (a toy sketch is shown after this list).}
#'
#'   \item{Pathway Expression Profile (PEP): }{a ranked list of
#'   pathways, as converted from a GEP according to a pathway
#'   collection.}
#'
#'   \item{condition: }{any transcriptomic-modelled biological state
#'   (drug treatment, gene knock-out, disease state, cell type, etc.)
#'   characterized by an induced GEP and therefore a PEP.}
#'
#'   \item{gep2pep repository: }{a pathway database and possibly a
#'   related database of PEPs as created by the \code{gep2pep}
#'   package. It is implemented in \code{repo} format.}
#' }
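#'
#' As a toy sketch of the GEP format described above (hypothetical
#' gene identifiers; each column is one GEP, as passed to
#' \code{\link{buildPEPs}}, and contains the ranks 1 to the number of
#' genes):
#'
#' \preformatted{
#' geps <- cbind(cond1 = c(3, 1, 5, 2, 4),
#'               cond2 = c(2, 5, 1, 4, 3))
#' rownames(geps) <- c("TP53", "MYC", "EGFR", "AKT1", "BRCA1")
#' }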
#' @references 
#' [1] Subramanian A. et al. Gene set enrichment analysis: A
#'     knowledge-based approach for interpreting genome-wide
#'     expression profiles. PNAS 102, 15545-15550 (2005).
#'
#' [2] Napolitano F. et al, Drug-set enrichment analysis: a novel tool
#'     to investigate drug mode of action. Bioinformatics 32, 235-241
#'     (2016).
#' 
#' [3] Napolitano, F. et al. gene2drug: a computational tool for
#'     pathway-based rational drug repositioning. Bioinformatics
#'     (2017). https://doi.org/10.1093/bioinformatics/btx800
#' 
#' [4] Lamb, J. et al. The Connectivity Map: Using Gene-Expression
#'     Signatures to Connect Small Molecules, Genes, and Disease. Science
#'     313, 1929-1935 (2006).
#' @docType package
#' @name gep2pep-package
#' @author Francesco Napolitano \email{franapoli@@gmail.com}
#' @aliases gep2pep
NULL

## repo is for storage of repositories
#' @import repo

## hdf5 is for raw-mode (large matrices)
#' @import rhdf5

## For gene sets management
#' @import GSEABase

## digest is needed to cache merged profiles
#' @importFrom digest digest

## iter is used with foreach
#' @importFrom iterators iter

#' @importFrom foreach foreach %dopar% %do%
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom stats ks.test pchisq
#' @importFrom XML xmlTreeParse xmlAttrs
#' @importFrom Biobase mkScalar
#' @importFrom methods is new

NULL

#' A class to contain categorized gene set collections
#'
#' This class is a simple generalization of the
#' \code{BroadCollection} class of \code{GSEABase}, storing gene
#' sets with assigned categories and subcategories that can be
#' different from those of the MSigDB.
#'
#' @slot category A character defining the main category that the gene
#'     set belongs to.
#' @slot subCategory A character defining the secondary category that
#'     the gene set belongs to.
#' @name CategorizedCollection-class
#' @rdname CategorizedCollection-class
#' @export
setClass("CategorizedCollection",
         contains = "CollectionType",
         representation = representation(
             category = "character",
             subCategory = "character"),         
         prototype = prototype(
             type = mkScalar("Categorized"),
             category = "uncategorized",
             subCategory = "uncategorized"),
         )

#' Constructor method for objects of class CategorizedCollection.
#'
#' See \code{CategorizedCollection-class}.
#'
#' @param category A character defining the main category that the
#'     gene set belongs to.
#' @param subCategory A character defining the secondary category that
#'     the gene set belongs to.
#' @return An object of class \code{CategorizedCollection}.
#' @export
#' @examples
#'
#' library(GSEABase)
#' gs1 <- GeneSet(setName="set1", setIdentifier="101")
#' collectionType(gs1) <- CategorizedCollection()
CategorizedCollection <- function(category="uncategorized",
                                  subCategory="uncategorized") {
    if (length(category)!=1)
        stop("category must be a scalar (length = 1)")
    
    new("CategorizedCollection",
        category=category,
        subCategory=subCategory)
}

#' Loads sample Gene Expression Profiles
#'
#' @return Sample gene expression data
#' @export
#' @examples
#'
#' geps <- loadSampleGEP()
#' 
loadSampleGEP <- function() {
    return(
        readRDS(system.file("extdata", "testgep.RDS",
                            package="gep2pep"))
    )
}

#' Loads sample pathway collections
#'
#' @return Sample pathway collections in \code{GeneSetCollection}
#'     format
#' @export
#' @examples
#'
#' pws <- loadSamplePWS()
#'
loadSamplePWS <- function() {
  db <- readRDS(system.file("extdata", "testgmd.RDS",
                            package="gep2pep"))
  db <- as.CategorizedCollection(db)
  return(db)
}


setMethod("show",
          signature=signature(object="CategorizedCollection"),
          function(object) {
              cat("collectionType: ", collectionType(object), "\n",
                  "  category: ",
                  attr(object, "category"), "\n",
                  "  subCategory: ",
                  attr(object, "subCategory") ,"\n", sep="")
          })


#' Converts GeneSetCollection objects to CategorizedCollection objects.
#'
#' @param GScollection An object of class \code{GeneSetCollection}.
#' @param category The name of the category that all the gene sets
#'     will be assigned to (see details).
#' @param subCategory The name of the subcategory that all the gene
#'     sets will be assigned to (see details).
#' @return A \code{GeneSetCollection} object in which each gene set has
#'     a \code{CategorizedCollection} collection type.
#' @details This function sets the \code{CollectionType} of each set
#'     in the collection to \code{CategorizedCollection}. If
#'     \code{GScollection} contains \code{BroadCollection} gene sets,
#'     their original \code{category} and \code{subCategory} fields
#'     will be used. Otherwise, the \code{category} and
#'     \code{subCategory} arguments will be used.
#' @examples
#' \dontrun{
#' 
#' ## To run this example, first obtain the MSigDB database in XML
#' ## format (see
#' ## http://software.broadinstitute.org/gsea/downloads.jsp). It is
#' ## assumed that the database is locally available as the file
#' ## "msigdb_v6.0.xml".
#'
#' ## The importMSigDB.xml function is just a shortcut to the
#' ## following:
#'
#' db <- getBroadSets("msigdb_v6.0.xml")
#' db <- as.CategorizedCollection(db)
#'
#' ## The database is now in an acceptable format to create a local
#' ## repository using createRepository
#' }
#'
#' ## A small sample of the MSigDB as imported by importMSigDB.xml is
#' ## included in gep2pep. The following creates (and deletes) a
#' ## gep2pep repository.
#'
#' db_sample <- loadSamplePWS()
#' 
#' ## The function \code{as.CategorizedCollection} can also be used to
#' ## create arbitrary gene set collections specifying the categories
#' ## and subcategories once for all the sets:
#'
#' library(GSEABase)
#' mysets <- as.CategorizedCollection(
#'               GeneSetCollection(
#'                   list(GeneSet(c("g1", "g2"), setName="set1"),
#'                        GeneSet(c("g3", "g4"), setName="set2"))
#'                   ),
#'               category="mycategory",
#'               subCategory="mysubcategory"
#'               )
#' newCollection <- GeneSetCollection(c(db_sample, mysets))
#'
#' ## The created repository will include both the sample gene sets
#' ## and the two sets just created:
#' 
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#' rp <- createRepository(repo_path, newCollection)
#' 
#' ## removing temporary repository
#' unlink(repo_path, TRUE)
#' @export
as.CategorizedCollection <- function(GScollection,
                                     category="uncategorized",
                                     subCategory="uncategorized")
{
    if(!is(GScollection, "GeneSetCollection"))
        say("collection must be an object of class GeneSetCollection",
            "error")

    y <- GeneSetCollection(
        sapply(GScollection,
               function(g) {
                   
                   if(is(collectionType(g), "BroadCollection")) {
                       coll <- attributes(collectionType(g))
                       originalID <- setIdentifier(g)
                       collectionType(g) <-
                           CategorizedCollection(coll$category,
                                                 coll$subCategory)
                       setIdentifier(g) <- originalID
                   } else if(is(collectionType(g), "CategorizedCollection")) {
                       ## nothing to do
                   } else {
                       collectionType(g) <- CategorizedCollection(
                           category, subCategory)
                   }
                   g
               })
    )

    return(y)
}

#' Converts gene set IDs to gene set names
#'
#' @param sets An object of class GeneSetCollection
#' @param ids character vector of gene set IDs to be converted to set
#' names
#' @return A vector of gene set names
#' @seealso CondSEA, PathSEA
#' @examples
#' collection <- loadSamplePWS()
#' setId2setName(collection, c("M3128", "M11607"))
#' @export
setId2setName <- function(sets, ids) {
    allids <- sapply(sets, setIdentifier)
    snm <- sapply(sets, setName)
    if(missing(ids))
        return(snm)
    return(snm[match(ids, allids)])
}


#' Imports PEPs created in raw mode
#'
#' Raw mode is meant to deal with large collections of PEPs (like
#' hundreds of thousands). In this case, problems may arise when
#' converting GEPs by loading all of them in memory at once. Raw mode
#' relies on the HDF5 format, which allows subsets of GEPs to be
#' loaded from disk. \code{buildPEPs}, when used in raw mode, can
#' create the corresponding subsets of PEPs, so that the job can be
#' distributed on a computer cluster. \code{importFromRawMode} then
#' joins the chunks into HDF5 matrices, which are stored into the
#' repository. The \code{.loadPEPs} function can seamlessly load PEPs
#' stored in normal (RDS) or HDF5 format.
#'  
#' @inheritParams dummyFunction
#' @param path Path where raw PEPs are stored (default is a "raw"
#'     directory under the repository root folder).
#' @return Nothing, used for side effects.
#' @seealso buildPEPs
#' @details PEPs are expected to be found at the specified \code{path}
#'     and to follow the naming convention generated by
#'     \code{buildPEPs}. According to that convention, each file is
#'     named using the format
#'     category_subcategory#chunknumber.RDS. All non-alphanumeric
#'     characters in the original category and subcategory names are
#'     replaced with an underscore (in rare cases this could create
#'     ambiguity that should be manually prevented). All chunks for
#'     the same subcategory are joined together following the chunk
#'     numbers into a single HDF5 matrix and stored in the repository
#'     as an "attachment" (see \code{repo} documentation).
#'
#'     Note that raw PEPs (by default everything at
#'     repository_root/raw) can be safely removed once they have been
#'     imported.
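#'
#'     As an example of the naming convention, raw files produced for
#'     the sample collections with \code{rawmode_id=1} and
#'     \code{rawmode_id=2} (as in the example below) would be named
#'     similarly to the following:
#'
#' \preformatted{
#' c3_TFT#1.RDS   c3_MIR#1.RDS   c4_CGN#1.RDS
#' c3_TFT#2.RDS   c3_MIR#2.RDS   c4_CGN#2.RDS
#' }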
#' @export
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#'
#' ## The following will create PEPs in 2 separate files
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps[,1:2], progress_bar=FALSE,
#'     rawmode_id=1)
#' buildPEPs(rp, geps[,3:5], progress_bar=FALSE,
#'     rawmode_id=2)
#'
#' ## The separate files are then merged into one (possibly big) file
#' ## in HDF5 format
#'
#' importFromRawMode(rp)
#'
#' ## Now most operations (excluding the addition of new PEPs to
#' ## existing collections) will be available as usual.
#'
#' unlink(repo_path, TRUE)
importFromRawMode <- function(rp, path=file.path(rp$root(), "raw"),
                              collections="all")
{
    
    allfiles <- list.files(path)
    say(paste0("Found ", length(allfiles), " raw files."))
    dbs <- gsub("#.+[.]RDS", "", allfiles)
    say(paste0("Found ", length(unique(dbs)), " collection names."))
    
    ## real collection names (not stored in filenames)
    collnames <- getCollections(rp)
    cleanedcollnames <- gsub("[^[:alnum:]]", "_", collnames)
    
    for(i in seq_along(unique(dbs))) {
        dbi <- unique(dbs)[i]
        collname <- collnames[match(dbi, cleanedcollnames)]

        if(!(length(collections)==1 && collections=="all")) {
            if(! collname %in% collections) {
                say(paste0("Collection ", collname,
                           " was not selected and will be skipped."))
                next
            }
        }

        if(rp$has(collname)) {
            say(paste0("A collection named ", collname,
                       " is already in the repository ",
                       "and will be skipped in raw mode"), "warning")
            next
        }
        
        say(paste0("Working on collection: ", collname))        
        
        subfiles <- allfiles[grep(dbi, allfiles)]
        ids <- as.numeric(gsub(".+#|[.]RDS", "", subfiles))
        say(paste0("Found ", length(unique(ids)), " unique IDs."))

        ## extracting chunk size
        fname <- paste0(dbi, "#", min(ids), ".RDS")
        Nchunk <- ncol(readRDS(file.path(path, fname))$ES)
        fname <- paste0(dbi, "#", max(ids), ".RDS")
        NlastChunk <- ncol(readRDS(file.path(path, fname))$ES)
        Ncol <- Nchunk*(length(unique(ids))-1) + NlastChunk
        say(paste0("Expected profiles: ", Ncol, " in chunks of size: ", Nchunk))

        say(paste0("Creating a repository entry."))
        fl <- tempfile()
        h5createFile(fl)
        rp$put(fl, collname, asattach=TRUE, tags=c("pep", "#hdf5"))
        fl <- rp$get(collname)

        ## attachments are hidden by default, but unhiding them
        ## triggers a warning because of a bug in repo
        curwarn <- options()$warn
        options(warn=-1)
        rp$untag(collname, "hide")
        options(warn=curwarn)

        ## fl <- tempfile()
        ## h5createFile(fl)
        ## rp$put(fl, dbi, asattach=T, tags=c("pep", "#hdf5"))
        ## fl <- rp$get(dbi)
        
        ## fl <- rp$get(collname)

        fname <- paste0(dbi, "#", min(ids), ".RDS")
        x <- readRDS(file.path(path, fname))$ES
        Nrow <- nrow(x)
        say(paste0("Creating 2 HDF5 dataset of size: ", Nrow, "x", Ncol))

        h5createDataset(fl, "ES", c(Nrow, Ncol),
                        maxdims=c(Nrow*10, Ncol*10),
                        chunk=c(Nrow, Nchunk))
        h5createDataset(fl, "PV", c(Nrow, Ncol),
                        maxdims=c(Nrow*10, Ncol*10),                        
                        chunk=c(Nrow, Nchunk))
        h5createDataset(fl, "rownames", Nrow,
                        maxdims=c(Nrow*10),
                        storage.mode="character",
                        size=256, chunk=Nrow)
        h5createDataset(fl, "colnames", c(Ncol, 1), ## necessary later
                                                    ## to have h5write
                                                    ## update all the
                                                    ## col names of a
                                                    ## chunk at once
                        maxdims=c(Ncol*10, 1),
                        storage.mode="character",
                        size=256, chunk=c(Nchunk,1))
        h5write(rownames(x), fl, "rownames")      
        say("Adding chunks...")
        uids <- sort(unique(ids))
        for(j in seq_along(uids)) {
            fsize <- .format.object_size(file.size(fl), "auto")
            fname <- paste0(dbi, "#", uids[j], ".RDS")
            ifile <- file.path(path, fname)
            x <- readRDS(ifile)
            ifsize <- .format.object_size(file.size(ifile), "auto")
            startCol <- (j-1)*Nchunk+1

            h5write(x$ES, fl, "ES", start=c(1,startCol),
                    createnewfile=FALSE)
            h5write(x$PV, fl, "PV", start=c(1,startCol),
                    createnewfile=FALSE)
            h5write(cbind(colnames(x$ES)), fl, "colnames", start=c(startCol,1))

            cat("Chunk ", j, " of ",
                length(uids), ", ", ifsize,
                " (current file size: ", fsize, ")\r",
                sep="")
        }
        say("Done.")
    }

    H5close()
}

#' Imports pathways data from an MSigDB XML file.
#'
#' Creates a \code{GeneSetCollection} object using the XML
#' distribution of the MSigDB (see references). The returned object
#' can be passed to \code{createRepository}.
#'
#' @param fname Path to an XML file downloaded from MSigDB.
#' @param organism Select only gene sets for a given organism. Default
#'     is "Homo Sapiens". Set to "all" to skip the filter and keep the
#'     gene sets of all organisms.
#' @return A \code{GeneSetCollection} object in which each gene set has
#'     a \code{CategorizedCollection} collection type.
#' @references
#'     \url{http://software.broadinstitute.org/gsea/downloads.jsp}
#' @details This function now just calls \code{getBroadSets(fname)}
#'     from the \code{GSEABase} package. However, it is left for
#'     backward compatibility and as an entry point to package
#'     functionalities.
#' @examples
#' \dontrun{
#' 
#' ## To run this example, first obtain the MSigDB database in XML
#' ## format (see
#' ## http://software.broadinstitute.org/gsea/downloads.jsp). It is
#' ## assumed that the database is locally available as the file
#' ## "msigdb_v6.0.xml".
#' 
#' db <- importMSigDB.xml("msigdb_v6.0.xml")
#'
#' ## The database is now in an acceptable format to create a local
#' ## repository using createRepository
#' }
#'
#' ## A small excerpt from the MSigDB is included in gep2pep. The
#' ## following creates (and then deletes) a gep2pep repository.
#'
#' db_sample <- loadSamplePWS()
#' 
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#' rp <- createRepository(repo_path, db_sample)
#'
#' ## removing temporary repository
#' unlink(repo_path, TRUE)
#' @export
importMSigDB.xml <- function(fname, organism="Homo Sapiens") {

    say("Loading gene sets...")

    result = tryCatch({
        sets <- getBroadSets(fname)
        say(paste0("Loaded ", length(sets), " sets."))
        orgs <- sapply(sets, function(s) attributes(s)$organism)        
        if(organism != "all") {
            w <- tolower(orgs) == tolower(organism)
            sets <- sets[w]
            say(paste0("Selected ", length(sets), " sets for: ", organism))
        }
        say("Converting gene sets...")
        as.CategorizedCollection(sets)    
    }, error = function(e) {        
        say(paste("GSEABase::getBroadSets failed with the error:"))
        print(e)
        say("Parsing XML with fallback code...")
        xml <- xmlTreeParse(fname, useInternalNodes=TRUE)   
        sets <- xml["/MSIGDB/GENESET"]

        ids <- sapply(sets, function(x) xmlAttrs(x)[["SYSTEMATIC_NAME"]])
        msigDB <- data.frame(
            id = ids,
            name = sapply(sets, function(x)
                xmlAttrs(x)[["STANDARD_NAME"]]),
            category = sapply(sets, function(x)
                xmlAttrs(x)[["CATEGORY_CODE"]]),
            subcategory = sapply(sets, function(x)
                xmlAttrs(x)[["SUB_CATEGORY_CODE"]]),
            organism = sapply(sets, function(x)
                xmlAttrs(x)[["ORGANISM"]]),
            desc = sapply(sets, function(x)
                xmlAttrs(x)[["DESCRIPTION_BRIEF"]]),
            desc_full = sapply(sets, function(x)
                xmlAttrs(x)[["DESCRIPTION_FULL"]]),
            set = sapply(sets, function(x)
                xmlAttrs(x)[["MEMBERS_SYMBOLIZED"]]),
            stringsAsFactors=FALSE
        )

        say("Converting gene sets...")
        gs <- list()
        k <- 1
        for(i in seq_len(nrow(msigDB))) {
            if(organism == "all" ||
               tolower(msigDB$organism[i]) == tolower(organism)) {
                gs[[k]] <- GeneSet(strsplit(msigDB$set[i], ",")[[1]],
                                   shortDescription = msigDB$desc[i],
                                   longDescription = msigDB$desc_full[i],
                                   setName = msigDB$name[i],
                                   setIdentifier = msigDB$id[i],
                                   organism = msigDB$organism[i],
                                   collectionType = CategorizedCollection(
                                       category=msigDB$category[i],
                                       subCategory=msigDB$subcategory[i]
                                   ))
                k <- k+1
            }
        }
        GeneSetCollection(gs)
    })

    say("done.")
    
    return(result)
}



#' Dummy function for parameter inheritance
#' @param rp A repository created by \code{\link{createRepository}}.
#' @param rp_peps A repository created with
#'     \code{\link{createRepository}}, and containing PEPs created
#'     with \code{\link{buildPEPs}}.
#' @param collections A subset of the collection names returned by
#'     \code{getCollections}. If set to "all" (default), all the
#'     collections in \code{rp} will be used.
#' @param collection One of the names returned by
#'     \code{getCollections}.
#' @return Nothing
#' @keywords internal
dummyFunction <- function(rp, rp_peps, collections, collection) {}


#' Checks an existing repository for consistency
#'
#' Checks both repository data consistency (see \code{repo_check} from
#' the \code{repo} package) and gep2pep-specific data consistency.
#' @inheritParams dummyFunction
#' @return Nothing.
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' checkRepository(rp)
#'
#' unlink(repo_path, TRUE)
#' @export
checkRepository <- function(rp) {
    say("Checking repository consistency...")
    rp$check()

    message()
    say("Cheching for gep2pep data consistency...")
    message()
    
    dbs <- getCollections(rp)
    perts <- rp$get("conditions")
    if(is.null(perts[[1]])) ## this happens due to a bug
        perts <- perts[-1]

    say("Checking conditions list...")
    problems <- FALSE
    
    off <- setdiff(names(perts), dbs)
    if(length(off) > 0) {
        say(paste("The following collections are in the conditions",
                  "list but not in the repository collections:"),
            "warning", off)
        problems <- TRUE
    }

    off <- setdiff(dbs, names(perts))
    if(length(off) > 0) {
        say(paste("The following collections are in the repository",
                  "collections but not in the conditions list:"),
            "warning", off)
        problems <- TRUE
    }

    w <- sapply(lapply(rp$entries(), get, x="tags"), `%in%`, x="pep")
    pepitems <- names(w[w])

    off <- setdiff(pepitems, dbs)
    if(length(off) > 0) {
        say(paste("The following collections have PEPs but not",
                  "pathways in the repository:"),
            "warning", off)
        problems <- TRUE
    }

    if(!problems)
        say("Ok.")

    if(length(perts)>0) {

        say("Checking PEPs...")
        for(i in seq_along(dbs)) {
            problems <- FALSE
            
            say(paste0("Checking collection: ", dbs[i], "..."))
            if(rp$has(dbs[i])) {
                peps <- rp$get(dbs[i])

                if(!identical(colnames(peps$ES), perts[[dbs[i]]])) {
                    say(paste("Column names in the PEP matrix differ from",
                              "those in the conditions repository item:",
                              "this is a serious inconsistency!"),
                        "warning")
                    problems <- TRUE
                }

                sets <- .loadCollection(rp, dbs[i])

                if(!identical(colnames(peps$ES), colnames(peps$PV))) {
                    say(paste("Column names of the ES matrix are not",
                              "identical to column names of PV matrix:",
                              "this is a serious inconsistency!"),
                        "warning")
                    problems <- TRUE
                }

                if(!identical(rownames(peps$ES), rownames(peps$PV))) {
                    say(paste("Row names of the ES matrix are not",
                              "identical to row names of PV matrix!",
                              "this is a serious inconsistency!"),
                        "warning")
                    problems <- TRUE
                }

                if(!setequal(names(sets), rownames(peps$PV))) {
                    say(paste("There are pathways in the repository that",
                              "are not in the PEP matrix."),
                        "warning")
                    problems <- TRUE
                }

                if(!problems)
                    say("ok.")
            }
        }

        say(paste("Checking wether different PEP collections",
                  "include the same conditions..."))
        out <- outer(perts, perts,
                     Vectorize(
                         function(a,b) length(intersect(a,b))
                     ))
        if(length(unique(as.vector(out)))!=1) {
            say(paste("Not all collections include the same conditions.",
                      "The intersection matrix follows."))
            print(out)
        } else say("All PEP collections have the same conditions")
        
    } else say("No conditions to check.")
    
}



#' Loads the matrix of Enrichment Scores for a collection
#'
#' @inheritParams dummyFunction
#' @return The matrix of Enrichment Scores (ES) of the
#'     Kolmogorov-Smirnov statistic for the pathway collection, if
#'     previously computed with \code{buildPEPs}. The entry \code{i,j}
#'     reports the ES for pathway \code{i} and condition \code{j}. If
#'     \code{buildPEPs} was not run, throws an error.
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' loadESmatrix(rp, "c3_TFT")[1:5,1:2]
#'
#' unlink(repo_path, TRUE)
#'
#' @export
loadESmatrix <- function(rp, collection)
{
    if(!is(collection, "character") || length(collection) > 1)
        say("Please provide a single collection name", "error")
    
    res <- getPEPs(rp, collection)$ES
    return(res)
}


#' Loads the matrix of p-values for a collection
#'
#' @inheritParams dummyFunction
#' @return The matrix of p-values (PV) of the
#'     Kolmogorov-Smirnov statistic for the pathway collection, if
#'     previously computed with \code{buildPEPs}. The entry \code{i,j}
#'     reports the PV for pathway \code{i} and condition \code{j}. If
#'     \code{buildPEPs} was not run, throws an error.
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' loadPVmatrix(rp, "c3_TFT")
#'
#' unlink(repo_path, TRUE)
#'
#' @export
loadPVmatrix <- function(rp, collection)
{
    if(!is(collection, "character") || length(collection) > 1)
        say("Please provide a single collection name", "error")
    
    res <- getPEPs(rp, collection)$PV
    return(res)
}

#' Loads a collection of pathways from the repository
#'
#' @inheritParams dummyFunction
#' @return a \code{GeneSetCollection} object loaded from the
#'     repository \code{rp}.
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' 
#' loadCollection(rp, "c3_TFT")
#'
#' unlink(repo_path, TRUE)
#'
#' @export
loadCollection <- function(rp, collection)
{
    if(!is(collection, "character") || length(collection) > 1)
        say("Please provide a single collection name", "error")

    res <- rp$get(paste0(collection, "_sets"))

    return(res)
}



#' Returns the names of the pathway collections in a repository.
#'
#' Given a \code{gep2pep} repository, returns the names of the
#' stored collections by looking at appropriate repository item
#' names.
#' 
#' @inheritParams dummyFunction
#' @return Vector of collection names (see details).
#' @details Each collection in a database has a "category" and a
#'     "subcategory" assigned, which are used to build the collection
#'     identifier as "category_subcategory". This function obtains the
#'     identifiers by looking at data stored in the repository
#'     \code{rp} (entries that are tagged with "sets").
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' ## Repo root created.
#' ## Repo created.
#' ## [15:45:06] Storing pathway data for collection: c3_TFT
#' ## [15:45:06] Storing pathway data for collection: c3_MIR
#' ## [15:45:06] Storing pathway data for collection: c4_CGN
#'
#' getCollections(rp)
#' ## [1] "c3_TFT" "c3_MIR" "c4_CGN"
#'
#' unlink(repo_path, TRUE)
#'
#' @export
getCollections <- function(rp)
{    
    w <- sapply(lapply(rp$entries(), get, x="tags"), `%in%`, x="sets")
    res <- gsub("_sets", "", names(w[w]))
    return(res)
}


#' Creates a repository of pathway collections.
#'
#' Given a database of collections, stores them in a local repository
#' to be used by \code{gep2pep} functions.
#'
#' @param path Path to a non-existing directory where the repository
#'     will be created.
#' @param sets A \code{GeneSetCollection} object in which all gene
#'     sets have a \code{CategorizedCollection} collection type (see
#'     \code{\link{as.CategorizedCollection}}).
#' @param name Name of the repository. Defaults to \code{NULL} (a
#'     generic name will be given).
#' @param description Description of the repository. If NULL
#'     (default), a generic description will be given.
#' @return An object of class \code{repo} that can be passed to
#'     \code{gep2pep} functions.
#' @details \code{sets} can be created by
#'     \code{\link{importMSigDB.xml}} or by building a \code{GSEABase}
#'     \code{GeneSetCollection} and then converting it with
#'     \code{\link{as.CategorizedCollection}}. See examples.
#' @seealso buildPEPs
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' ## Repo root created.
#' ## Repo created.
#' ## [15:45:06] Storing pathway data for collection: c3_TFT
#' ## [15:45:06] Storing pathway data for collection: c3_MIR
#' ## [15:45:06] Storing pathway data for collection: c4_CGN
#'
#' rp
#' ##         ID    Dims     Size
#' ## c3_TFT_sets   10 18.16 kB
#' ## c3_MIR_sets   10 17.25 kB
#' ## c4_CGN_sets   10   6.9 kB
#'
#' unlink(repo_path, TRUE)
#' @export
createRepository <- function(path, sets, name=NULL, description=NULL)
{
    if(!is.null(sets) && !is(sets, "GeneSetCollection"))
        say("sets must be an object of class GeneSetCollection", "error")

    types <- sapply(sets, collectionType)
    allCat <- all(sapply(types, is, class2="CategorizedCollection"))
    if(!allCat) {
        say("all sets must be of type CategorizedCollection",
            "error")
    }
    
    if(file.exists(path)) {
        say("Can not create repository in existing folder", "error")
    } else rp <- repo_open(path, TRUE)

    if(is.null(name))
        name <- "gep2pep repository"
    if(is.null(description))
        description <- paste("This repository contains pathway information",
                             "and possibly Pathway Expression Profiles", 
                             "created with the gep2pep package.")    
    
    
    rp$project(name, description)

    ## un-hiding the project item
    ## silencing warning because of a bug in repo
    curwarn <- options()$warn
    options(warn=-1)
    rp$untag(name, "hide")
    options(warn=curwarn)

    if(!is.null(sets)) {
        db_ids <- makeCollectionIDs(sets)
        subdbs <- unique(db_ids)
        
        for(i in seq_along(subdbs))
        {
            dbi <- subdbs[i]
            say(paste("Storing pathway data for collection:", dbi))
            rp$put(sets[which(db_ids == dbi)],
                   paste0(subdbs[i], "_sets"),
                   paste("Pathway information for collection", subdbs[i]),
                   c("gep2pep", "sets"),
                   prj = name)
        }
    }

    rp$put(list(NULL), "conditions",
           "Condition lists for PEPs",
           c("gep2pep", "perts"),
           prj = name)
    
    return(invisible(rp))
}


#' Opens an existing repository of pathway collections.
#'
#' The repository must have been created by
#' \code{\link{createRepository}}. Provides an R object to interact
#' with the repository.
#'
#' @param path Path to a directory where the repository has been
#'     created with \code{\link{createRepository}}.
#' @return An object of class \code{repo} that can be passed to
#'     \code{gep2pep} functions.
#' @details This function only calls the \code{repo_open} function
#'     from the \code{repo} package on \code{path}. It is meant to
#'     allow users not to explicitly load the \code{repo} library,
#'     unless they want to access advanced features.
#' @seealso createRepository
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' rp2 <- openRepository(repo_path)
#'
#' ## rp and rp2 point to the same data:
#' identical(rp$entries(), rp2$entries())
#' ## > [1] TRUE
#'
#' unlink(repo_path, TRUE)
#' @export
openRepository <- function(path)
{
    if(!file.exists(path)) {
        say(paste("path must point to an existing directory containing",
                  "a repository created with createRepository"), "error")
    }
    rp <- repo_open(path, TRUE)
    return(invisible(rp))
}



#' Creates a collection label for each pathway.
#'
#' Given a database, uses "category" and "subcategory" entries to
#' create a vector of collection identifiers. Useful to extract a
#' collection from a database.
#'
#' @param sets A pathway database in the same format as output by
#'     \code{importMSigDB.xml}.
#' @return A vector of identifiers, one per pathway, with the format:
#'     "category_subcategory".
#' @seealso importMSigDB.xml
#' @examples
#' db <- loadSamplePWS()
#' ids <- makeCollectionIDs(db)
#'
#' unique(ids)
#' ## [1] "c3_TFT" "c3_MIR" "c4_CGN"
#'
#' db <- db[ids=="c3_MIR"]
#'
#' length(db)
#' ## [1] 10
#'
#' @export
makeCollectionIDs <- function(sets) {
    if(!is(sets, "GeneSetCollection"))
        say("sets must be an object of class GeneSetCollection")
    sets <- convertFromGSetClass(sets)
    
    dbs <- sapply(sets, get, x="category")
    subdbs <- sapply(sets, get, x="subcategory")
    w <- which(subdbs=="" | is.na(subdbs)) 
    if(length(w)>0)
        subdbs[w] <- dbs[w]
    db_ids <- paste(dbs, subdbs, sep="_")
    return(db_ids)
}


#' Build PEPs from GEPs and stores them in the repository.
#'
#' Given a matrix of ranked lists of genes (GEPs) and a \code{gep2pep}
#' repository, converts GEPs to PEPs and stores the latter in the
#' repository.
#'
#' @inheritParams dummyFunction
#' @param geps A matrix of ranks where each row corresponds to a gene
#'     and each column to a condition. Each column must include all
#'     ranks from 1 to the number of rows. Row and column names must
#'     be defined. Row names will be matched against gene identifiers
#'     in the pathways collections, and unrecognized gene names will
#'     not be used.
#' @param min_size An integer representing the minimum number of genes
#'     that must be included in a set before the KS statistic is
#'     computed. Smaller gene sets will get ES=NA and p=NA. Default is
#'     3. Ignored for SGE mode (see \code{addSingleGeneSets}).
#' @param max_size An integer representing the maximum number of genes
#'     that a set can include for the KS statistic to be
#'     computed. Larger gene sets will get ES=NA and p=NA. Default is
#'     500.
#' @param parallel If TRUE, gene sets will be processed in
#'     parallel. Requires a parallel backend.
#' @param replace_existing What to do if PEPs, identified by the column
#'     names of \code{geps}, are already present in the repository. If
#'     set to TRUE, they will be replaced; otherwise they will be
#'     skipped and only PEPs of new conditions will be computed and
#'     added. Either way, a warning will be thrown.
#' @param donotstore Just compute and return the pathway-based
#'     profiles without storing them in the repository. The repository
#'     is still required to load pathway data, however it will not be
#'     modified.
#' @param progress_bar If set to TRUE (default) will show a progress
#'     bar updated after conversion of each column of \code{geps}.
#' @param rawmode_id An integer to be appended to files produced in
#'     raw mode (see details). If set to NULL (default), raw mode is
#'     turned off.
#' @param rawmode_outdir A character string specifying the destination
#'     path for files produced in raw mode (by default it is
#'     ROOT/raw, where ROOT is the root of the repository). Ignored if
#'     \code{rawmode_id} is NULL.
#' @return Nothing. The computed PEPs will be available in the
#'     repository.
#' @seealso importFromRawMode
#' @details By default, the output is written to the repository as new
#'     items named after the collection names. However, it is possible
#'     to bypass the repository and write the output to regular files
#'     by turning 'raw mode' on through the \code{rawmode_id} and
#'     \code{rawmode_outdir} parameters. This is particularly useful
#'     when dealing with very large corpora of GEPs, and conversions
#'     are split into independent jobs submitted to a scheduler. At
#'     the end, the data will need to be reconstructed and put into
#'     the repository using \code{importFromRawMode} in order to
#'     perform \code{CondSEA} or \code{PathSEA} analysis.
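#'
#'     A sketch of how raw mode could be used to split a large
#'     conversion into chunks (each iteration of the loop could
#'     instead be run as an independent job on a cluster):
#'
#' \preformatted{
#' chunks <- split(seq_len(ncol(geps)),
#'                 ceiling(seq_len(ncol(geps)) / 1000))
#' for(i in seq_along(chunks))
#'     buildPEPs(rp, geps[, chunks[[i]], drop=FALSE], rawmode_id=i)
#' importFromRawMode(rp)
#' }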
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' ## Repo root created.
#' ## Repo created.
#' ## [15:45:06] Storing pathway data for collection: c3_TFT
#' ## [15:45:06] Storing pathway data for collection: c3_MIR
#' ## [15:45:06] Storing pathway data for collection: c4_CGN
#'
#' rp
#' ##          ID   Dims     Size
#' ## c3_TFT_sets   10   18.16 kB
#' ## c3_MIR_sets   10   17.25 kB
#' ## c4_CGN_sets   10     6.9 kB
#'
#' ## Loading sample gene expression profiles
#' geps <- loadSampleGEP()
#'
#' geps[1:3,1:3]
#' ##       (+)_chelidonine (+)_isoprenaline (+/_)_catechin
#' ## AKT3               88              117            417
#' ## MED6              357              410             34
#' ## NR2E3             383              121            453
#'
#' buildPEPs(rp, geps)
#'
#' rp
#' ##           ID  Dims     Size
#' ##   c3_TFT_sets   10 18.16 kB
#' ##   c3_MIR_sets   10 17.25 kB
#' ##   c4_CGN_sets   10   6.9 kB
#' ##        c3_TFT    2  1.07 kB
#' ##        c3_MIR    2  1.07 kB
#' ##        c4_CGN    2  1.04 kB
#'
#' unlink(repo_path, TRUE)
#'
#' @export
buildPEPs <- function(rp, geps, min_size=3, max_size=500,
                      parallel=FALSE, collections="all",
                      replace_existing=FALSE, donotstore=FALSE,
                      progress_bar=TRUE,
                      rawmode_id=NULL,
                      rawmode_outdir=file.path(rp$root(), "raw"))
{
    checkGEPsFormat(geps)
    perts <- rp$get("conditions")
    rawmode <- !is.null(rawmode_id)

    if(replace_existing && donotstore)
        say("only one of replace_existing and donotstore can be TRUE",
            type="error")

    ## the following is to force recomputing existing PEPs
    if(donotstore)
        replace_existing <- TRUE
    
    if(rawmode && rawmode_id %% 1 != 0)
        say("rawmode_id must be an integer number", type="error")
    
    if(length(collections) == 1 && collections == "all") {
        dbs <- getCollections(rp)
    } else dbs <- collections

    if(donotstore)
        res <- list()
    
    for(i in seq_along(dbs))
    {
        say(paste0("Working on collection: ", dbs[i],
                   " (", i, "/", length(dbs), ")" ))

        if(rp$has(dbs[i]))
            curpeps <- perts[[dbs[i]]] else curpeps <- NULL
        newpeps <- setdiff(colnames(geps), curpeps)
        oldpeps <- intersect(colnames(geps), curpeps)

        if(length(oldpeps) > 0) {
            if(rawmode) {
                say(paste0(length(oldpeps),
                           " existing PEPs found, ",
                           "but this will be ignored in rawmode: ",
                           paste(oldpeps, collapse=", ")))
                newpeps <- colnames(geps)
            } else {
                msg <- paste0(length(oldpeps),
                              " existing PEPs will be skipped: ",
                              paste(oldpeps, collapse=", "))
                if(replace_existing) {
                    msg <- gsub("skipped", "replaced", msg)
                    newpeps <- colnames(geps)
                }
                say(msg, type="warning")
            }
        }

        if(length(newpeps) > 0) {
            gepsi <- geps[, newpeps, drop=FALSE]
            thisdb <- .loadCollection(rp, dbs[i])

            SGEmode <- "SGE" %in% rp$tags(paste0(dbs[i], "_sets"))
            if(SGEmode) {
                say("Running in Single Gene mode")
                peps <- gep2pep(gepsi, thisdb, min_size=1, max_size,
                                parallel, progress_bar, SGEmode=TRUE)
            } else {
                peps <- gep2pep(gepsi, thisdb, min_size, max_size,
                                parallel, progress_bar, SGEmode=FALSE)
            }

            if(donotstore) {
                res[[dbs[i]]] <- peps
            } else {
                storePEPs(rp, dbs[i], peps, rawmode_id,
                          rawmode_outdir)
            }
        }
    }
    if(donotstore)
        return(res)
}



#' Export CondSEA or PathSEA results to XLS format
#'
#' The XLS output includes the full CondSEA or PathSEA results,
#' together with additional gene set information for the CondSEA. If
#' the PathSEA or CondSEA analysis was performed with
#' \code{details=TRUE}, details will be reported in the XLS file. This
#' function requires the WriteXLS library.
#'
#' @inheritParams dummyFunction
#' @param results The output of \code{CondSEA} or \code{PathSEA}.
#' @param outname Name of the XLS file to be created.
#' @return Nothing.
#' @seealso CondSEA, PathSEA
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pgset <- c("(+)_chelidonine", "(+/_)_catechin")
#' psea <- CondSEA(rp, pgset)
#'
#' \dontrun{
#' exportSEA(rp, psea)
#' }
#'
#' unlink(repo_path, TRUE)
#'
#' @export
exportSEA <- function(rp, results, outname=NULL)
{
    
    type <- names(results)[[1]]
    if(! tolower(type) %in% c("condsea", "pathsea"))
        say("type must be on of: CondSEA, PathSEA", "error")

    if(is.null(outname))
        outname <- paste0(type, ".xls")
    
    if (requireNamespace("WriteXLS", quietly = TRUE)) {
        sheets <- attachInfo(rp, results)
        names(sheets) <- gsub(":","_",names(sheets))
        WriteXLS::WriteXLS(sheets, outname, AutoFilter=TRUE,
                           BoldHeaderRow=TRUE, FreezeRow=1)
    } else {
        stop("The suggested package WriteXLS is not installed.")
    }              
}


#' Extracts the results matrix from \code{CondSEA} or \code{PathSEA}
#' output
#'
#' @inheritParams dummyFunction
#' @param analysis The output of either \code{CondSEA} or
#'     \code{PathSEA}.
#' @return A two-column matrix including the ESs and p-values (see the
#'     details of \code{CondSEA} and \code{PathSEA}) for each pathway
#'     (CondSEA) or condition (PathSEA) in the chosen collection.
#' @seealso CondSEA, PathSEA
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pgset <- c("(+)_chelidonine", "(+/_)_catechin")
#' psea <- CondSEA(rp, pgset)
#'
#' getResults(psea, "c3_TFT")
#'
#' unlink(repo_path, TRUE)
#'
#' @export
getResults <- function(analysis, collection)
{
    if(!is(collection, "character") || length(collection) > 1)
        say("Please provide a single collection name", "error")

    what <- c("CondSEA", "PathSEA") %in% names(analysis)
    if(
        !is(analysis, "list") ||
        !any(what) ||
        !("details" %in% names(analysis))
    ) say("analysis does not look like the output of CondSEA or PathSEA",
          "error")

    wanalysis <- c("CondSEA", "PathSEA")[what]
    if(! collection %in% names(analysis[[wanalysis]]))
        say(paste("There are no results for the collection:", collection),
            "error")

    return(analysis[[wanalysis]][[collection]])
}


#' Extracts the details matrix from \code{CondSEA} or \code{PathSEA}
#' output
#' @inheritParams dummyFunction
#' @param analysis The output of either \code{CondSEA} or
#'     \code{PathSEA}.
#' @return A matrix including the ranks of each pathway (over rows)
#'     and each condition (over columns) used as input to
#'     \code{CondSEA} or \code{PathSEA}.
#' @seealso CondSEA, PathSEA
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pgset <- c("(+)_chelidonine", "(+/_)_catechin")
#' psea <- CondSEA(rp, pgset)
#'
#' getDetails(psea, "c3_TFT")
#'
#' unlink(repo_path, TRUE)
#'
#' @export
getDetails <- function(analysis, collection)
{
    if(!is(collection, "character") || length(collection) > 1)
        say("Please provide a single collection name", "error")

    what <- c("CondSEA", "PathSEA") %in% names(analysis)
    if(
        !is(analysis, "list") ||
        !any(what) ||
        !("details" %in% names(analysis))
    ) say("analysis does not look like the output of CondSEA or PathSEA",
          "error")

    wanalysis <- c("CondSEA", "PathSEA")[what]
    if(! collection %in% names(analysis[[wanalysis]]))
        say(paste("There are no results for the collection:", collection),
            "error")    
    
    return(analysis[["details"]][[collection]])
}


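## Internal: returns the condition (column) names stored for a PEP
## collection, reading them from the HDF5 attachment if the collection
## was imported in raw mode, or from the stored ES matrix otherwise.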
.loadPerts <- function(rp, coll) {
    ish5 <- "#hdf5" %in% rp$tags(coll)

    if(ish5){
        perts <- as.vector(h5read(rp$get(coll), "colnames"))
    } else perts <- colnames(rp$get(coll)$ES)
    return(perts)
}

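## Internal: loads the ES and PV matrices of a PEP collection,
## optionally restricted to a subset of conditions. Works seamlessly
## whether the PEPs are stored as a regular repository item (RDS) or
## as an HDF5 attachment (raw mode).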
.loadPEPs <- function(rp, coll, subset) {
    ish5 <- "#hdf5" %in% rp$tags(coll)

    if(ish5) {
        fname <- rp$get(coll)
        hcolnames <- as.vector(h5read(fname, "colnames"))
        
        if(missing(subset))
            subset <- hcolnames

        if(is.character(subset))
            subset <- match(subset, hcolnames)
        
        peps <- list(
            ES = h5read(fname, "ES", index=list(NULL, subset)),
            PV = h5read(fname, "PV", index=list(NULL, subset))
        )
        rownames(peps$ES) <- rownames(peps$PV) <-
            h5read(fname, "rownames")
        colnames(peps$ES) <- colnames(peps$PV) <-
            hcolnames[subset]
    } else {
        peps <- list(
            ES = rp$get(coll)$ES[, subset, drop=FALSE],
            PV = rp$get(coll)$PV[, subset, drop=FALSE]
        )
    }
    
    return(peps)
}


#' Performs Condition Set Enrichment Analysis
#'
#' Condition Set Enrichment Analysis (CondSEA) can be seen as a
#' Gene-SEA performed over rows (as opposed to columns) of a matrix of
#' GEPs. It tells how much a pathway is consistently dysregulated
#' under a set of conditions (such as a set of drug treatments,
#' disease states, cell types, etc.) when compared to a statistical
#' background of other conditions.
#'
#' @inheritParams dummyFunction
#' @param pgset A vector of names of conditions. Corresponding PEPs
#'     must exist in all the pathway collections currently in
#'     \code{rp_peps}.
#' @param bgset The background against which to compare
#'     \code{pgset}. If set to \code{all} (default), all the remaining
#'     PEPs will be used. If provided, the corresponding PEPs must
#'     exist in all the pathway collections currently in \code{rp_peps}.
#' @param details If TRUE (default) rank details will be reported for
#'     each condition in \code{pgset}.
#' @param rankingFun The function used to rank PEPs column-wise. By
#' default \code{rankPEPsByRows.ES} is used, which ranks using gene
#' set enrichment scores (see details).
#' @param usecache If set to TRUE, the computed ranked matrix will be
#' stored in the repository (see details). FALSE by default.
#' @param sortoutput If TRUE (default) the output gene sets will be
#' sorted in order of increasing p-value.
#' @return A list of 2 elements named "CondSEA" and "details". The
#' "CondSEA" entry contains, for each pathway collection, a two-column
#' matrix of ESs and p-values (see details), one row per pathway. The
#' "details" entry reports the rank of each condition in \code{pgset}
#' for each pathway.
#' 
#' @details For each pathway, all conditions are ranked by how much
#' they dysregulate it (from the most UP-regulating to the most
#' DOWN-regulating). Then, a Kolmogorov-Smirnov (KS) test is performed
#' to compare the ranks assigned to conditions in \code{pgset} against
#' the ranks assigned to conditions in \code{bgset}. A positive
#' (negative) Enrichment Score (ES) of the KS test indicates whether
#' each pathway is UP- (DOWN-) regulated by \code{pgset} as compared
#' to \code{bgset}. A p-value is associated to the ES.
#'
#' When PEPs are obtained from drug-induced gene expression profiles,
#' \code{CondSEA} corresponds to the Drug-Set Enrichment Analysis [1].
#'
#' The \code{rankingFun} must take as input PEPs like those loaded
#' from the repository and return a matrix of row-wise ranks. Each row
#' must contain ranks from 1 to the number of PEPs minus the number
#' of NAs in the row.
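#'
#' As a minimal sketch of a custom ranking function satisfying this
#' contract (hypothetical code, not the implementation of
#' \code{rankPEPsByRows.ES}; note that internal code also expects the
#' per-row number of NAs to be attached as the \code{"nas"} attribute,
#' which is done below as well):
#'
#' \preformatted{
#' myRankFun <- function(peps) {
#'     ## rank conditions within each pathway by decreasing ES
#'     ranked <- t(apply(peps$ES, 1, function(x) rank(-x, na.last="keep")))
#'     attr(ranked, "nas") <- rowSums(is.na(peps$ES))
#'     ranked
#' }
#' psea <- CondSEA(rp, pgset, rankingFun=myRankFun)
#' }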
#' 
#' When \code{usecache=TRUE}, the ranked matrix is permanently stored
#' in HDF5 format, and subsequent calls to \code{CondSEA} will load
#' from the disk the necessary ranks (not the whole matrix). The
#' correct cached data is identified by the alphabetically sorted set
#' \code{union(pgset, bgset)}, by the collection name, and by the
#' ranking function. Additional calls to CondSEA with variations of
#' these inputs will create additional cache. Cached data is hidden in
#' the repository by default and can be printed with
#' \code{rp_peps$print(all=TRUE)}, and cleared with
#' \code{clearCache(rp_peps)}.
#'
#'     
#' @seealso getResults, getDetails, clearCache
#' @references
#'
#' [1] Napolitano F. et al, Drug-set enrichment analysis: a
#'     novel tool to investigate drug mode of action. Bioinformatics
#'     32, 235-241 (2016).
#'
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pgset <- c("(+)_chelidonine", "(+/_)_catechin")
#' psea <- CondSEA(rp, pgset)
#'
#' res <- getResults(psea, "c3_TFT")
#'
#' ## getting the names of the top pathways
#'
#' setId2setName(loadCollection(rp, "c3_TFT"), rownames(res))
#' 
#' unlink(repo_path, TRUE)
#'
#' @export
CondSEA <- function(rp_peps, pgset, bgset="all", collections="all",
                    details=TRUE, rankingFun = rankPEPsByRows.ES,
                    usecache=FALSE, sortoutput=TRUE)
{
    dbs <- collections

    if(!is.character(pgset) || !is.character(bgset))
        say("pgset and bgset must be of class character",
           "error")
   
    if(length(dbs) == 1 && dbs=="all") {
        dbs <- getCollections(rp_peps)
    } else {
        off <- setdiff(dbs, getCollections(rp_peps))
        if(length(off)>0)
            say(paste0("The following collections could not be found: ",
                       paste(off, collapse=", ")), "error")
    }

    if(details)
        thedetails <- list() else thedetails <- NULL
    
    res <- list()
    
    for(i in seq_along(dbs)) {
        say(paste0("Working on collection: ", dbs[i]))

        allperts <- .loadPerts(rp_peps, dbs[i])

        checkMissingPerts <- function(testset, backset) {
            if(!all(testset %in% backset))
                say(paste("The following conditions could not be found:",
                          paste(
                              setdiff(testset, backset),
                              collapse = ", ")),
                    "error")
        }

        checkMissingPerts(pgset, allperts)
        if(length(bgset) == 1 && bgset=="all") {
            bgset <- allperts
        }
        checkMissingPerts(bgset, allperts)                

        if(length(intersect(pgset, bgset))>0) {
            bgset <- setdiff(bgset, pgset)
            say("Common conditions removed from bgset")
        }
        
        rankingset <- union(bgset, pgset)
        
        ranked <- .getPEPRanks(
            rp_peps, dbs[i], rankingset,
            subcols=pgset,
            rankingFun=rankingFun,
            usecache=usecache
        )
        
        say(paste0("Computing enrichments"))
        
        ks <- .doPSEA(ranked, length(rankingset))

        PVs <- sapply(ks, get, x="p.value")
        
        if(sortoutput) {
            sorter <- order(PVs)
        } else sorter <- seq_along(PVs)
        res[[dbs[i]]] <- data.frame(ES = sapply(ks, get, x="ES"),
                                    PV = PVs)[sorter, ]
        if(details)
            thedetails[[dbs[i]]] <- ranked[sorter, ]
        say("Done.")
    }

    return(list(CondSEA=res, details=thedetails))
}

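## Internal: given the row-wise ranks of the conditions of interest,
## performs a KS test for each pathway, comparing those ranks against
## the complementary ranks of the background (of size backgroundSize
## minus the per-row NA count).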
.doPSEA <- function(ranked, backgroundSize, parallel=FALSE) {
    nas <- attributes(ranked)$nas

    '%dobest%' <- if (parallel) get('%dopar%') else get('%do%')
    set <- NULL ## to cope with R CMD check NOTE
 
    ##    ks <- sapply(1:nrow(ranked), function(i) {
    ks <- foreach(row = iter(ranked, by="row"),
                  nas = iter(nas)) %dobest% {
                      innas <- is.na(row)
                      inset <- row[!innas]
                      maxr <- backgroundSize - nas
                      outset <- seq_len(maxr)[-inset]
                      if(length(inset)>=1 && length(outset)>=1) {
                          res <- ks.test.2(inset, outset, maxCombSize=10^10)
                      } else res <- list(ES=NA, p.value=NA)
                      res
                  }
                      
    names(ks) <- rownames(ranked)
    return(ks)
}

#' Clear cached ranked matrices
#'
#' @inheritParams dummyFunction
#' @details This will clear everything in the repository tagged with
#' "stash", which by default includes only the ranked matrices cached
#' by gep2pep functions such as \code{CondSEA}.
#' @seealso CondSEA
#' @return Nothing, used for side effects
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pgset <- c("(+)_chelidonine", "(+/_)_catechin")
#' psea <- CondSEA(rp, pgset, usecache=TRUE)
#'
#' ## the repository contains cached data
#' print(rp, all=TRUE)
#'
#' clearCache(rp)
#'
#' unlink(repo_path, TRUE)
#' @export
clearCache <- function(rp_peps)
{
    if("stash" %in% rp_peps$tags())
        suppressWarnings(rp_peps$stashclear(TRUE))
}

.storeCachedRanks <- function(rp_peps, name, ranks) {
    fl <- tempfile()
    h5createFile(fl)
    rp_peps$put(fl, name, asattach=TRUE, tags=c("stash", "#hdf5"))        
    fl <- rp_peps$get(name)
    Nrow <- nrow(ranks)
    Ncol <- ncol(ranks)
    ranks[is.na(ranks)] <- -1
    h5createDataset(fl, "ranks", c(Nrow, Ncol),
                    maxdims=c(Nrow*10, Ncol*10),
                    chunk=c(Nrow, Ncol))
    l <- length(attr(ranks, "nas"))
    h5createDataset(fl, "nas", l,
                    maxdims=l*10,
                    chunk=length(attr(ranks, "nas")))
    h5write(ranks, fl, "ranks")
    h5write(attr(ranks, "nas"), fl, "nas")
    h5write(colnames(ranks), fl, "colnames")
    h5write(rownames(ranks), fl, "rownames")
    H5close()
}
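
## Illustrative sketch (not part of the package): the HDF5 round trip used
## for cached rank matrices, on a throw-away temporary file. It only uses
## rhdf5 calls already employed by .storeCachedRanks and .getPEPRanks.
.cachedRanksRoundTripExample <- function() {
    fl <- tempfile(fileext=".h5")
    h5createFile(fl)
    m <- matrix(as.numeric(1:12), 3, 4,
                dimnames=list(paste0("set", 1:3), paste0("cond", 1:4)))
    h5createDataset(fl, "ranks", dim(m), chunk=dim(m))
    h5write(m, fl, "ranks")
    h5write(colnames(m), fl, "colnames")
    ## read back a column subset only, as .getPEPRanks does with subcols
    sub <- h5read(fl, "ranks", index=list(NULL, c(2, 4)))
    H5close()
    sub
}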

.getPEPRanks <- function(rp_peps, coll, rankingset,
                         subrows, subcols,
                         rankingFun, usecache=FALSE)
{
    if(!is.character(rankingset) ||
       !is.character(subcols))
        say("rankingset and subcols must be of class character",
            "error")
    meta <- list(coll, sort(rankingset), deparse(substitute(rankingFun)))
    md5 <- digest(meta)
    if(rp_peps$has(md5) && usecache) {        
        say("Loading cached ranks")
        if(missing(subrows)) subrows <- NULL
        if(missing(subcols)) subcols <- NULL
        if(!is.null(subrows)) {
            rnames <- h5read(rp_peps$get(md5), "rownames")
            subrows <- match(subrows, rnames)
        }
        if(!is.null(subcols)) {
            cnames <- as.vector(h5read(rp_peps$get(md5), "colnames"))
            subcols <- match(subcols, cnames)
        }
        ranked <- h5read(rp_peps$get(md5), "ranks",
                         index=list(subrows, subcols))
        ranked[ranked==-1] <- NA
        nas <- h5read(rp_peps$get(md5), "nas")
        attr(ranked, "nas") <- nas
    } else {
        peps <- .loadPEPs(rp_peps, coll, rankingset)
        say(paste0("Ranking collection"))
        ranked <- rankingFun(peps)

        if(usecache) {
            say(paste0("Caching ranks"))
            .storeCachedRanks(rp_peps, md5, ranked)
        }
        ## clean this
        nas <- attr(ranked, "nas")
        ranked <- ranked[subrows, subcols, drop=FALSE]
        attr(ranked, "nas") <- nas
    }
    return(ranked)
}
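
## Illustrative sketch (not part of the package): the cache key used by
## .getPEPRanks is the digest of the collection name, the sorted ranking
## set and the deparsed ranking function, so repeated calls with the same
## inputs reuse the same repository item. Names below are made up.
.rankCacheKeyExample <- function() {
    meta <- list("c3_TFT",
                 sort(c("(+)_chelidonine", "(_)_mk_801")),
                 "rankPEPsByRows.ES")
    digest(meta)  ## the md5 string used as repository item name
}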

#' Performs Pathway Set Enrichment Analysis (PSEA)
#'
#' PathSEA is analogous to the Gene Set Enrichment Analysis (GSEA),
#' but for pathways instead of single genes. It can therefore be used
#' to look for conditions under which a given set of pathways is
#' consistently UP- or DOWN-regulated.
#'
#' @inheritParams dummyFunction
#' @param pathways A database of pathways in the same format as input
#'     to \code{\link{createRepository}}. PSEA will be performed for
#'     each database separately.
#' @param bgsets Another list like \code{pathways}, representing the
#'     statistical background for each database. If set to "all" (the
#'     default), all pathways that are in the repository and not in
#'     \code{pathways} will be used.
#' @param subset Character vector including PEP names to be
#'     considered (all by default, which may take time).
#' @param details If TRUE (default), details will be reported for each
#'     pathway in \code{pathways}.
#' @param rankingFun The function used to rank PEPs column-wise. By
#'     default \code{rankPEPsByCols.SPV} is used, which ranks pathways
#'     by p-value, signed according to the direction of the enrichment
#'     (see details).
#' @return A list of 2 elements named "PathSEA" and "details". The
#'     "PathSEA" entry contains, for each collection, a 2-column data
#'     frame of ESs and p-values, one row per condition. The "details"
#'     entry reports the rank of each pathway in \code{pathways} for
#'     each condition.
#' @details For each condition, all pathways are ranked by how much
#'     they are dysregulated by it (from the most UP-regulated to the
#'     most DOWN-regulated, according to the corresponding
#'     p-values). Then, a Kolmogorov-Smirnov (KS) test is performed to
#'     compare the ranks assigned to pathways in \code{pathways}
#'     against the ranks assigned to pathways in \code{bgsets}. A
#'     positive (negative) Enrichment Score (ES) of the KS test
#'     indicates that the pathways in \code{pathways} are UP-
#'     (DOWN-) regulated by the condition as compared to those in
#'     \code{bgsets}. A p-value is associated with the ES.
#'
#'     When PEPs are obtained from drug-induced gene expression
#'     profiles, \code{PathSEA} can be used together with
#'     \code{gene2pathways} to perform gene2drug [1] analysis, which
#'     predicts which drugs may target a gene of interest (or mimic
#'     such an effect).
#' 
#'     The \code{rankingFun} must take in input PEPs like those loaded
#'     from the repository and return a matrix of column-wise
#'     ranks. Each column must contain ranks from 1 to the number of
#'     gene sets minus the number of NAs in the column.
#'
#' @seealso getResults, getDetails
#' @references
#'     [1] Napolitano, F. et al. gene2drug: a computational tool for
#'         pathway-based rational drug repositioning. Bioinformatics
#'         (2017). https://doi.org/10.1093/bioinformatics/btx800
#' @examples
#' library(GSEABase)
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' pathways <- c("M11607", "M10817", "M16694",         ## from c3_TFT
#'               "M19723", "M5038", "M13419", "M1094") ## from c4_CGN
#' w <- sapply(db, setIdentifier) %in% pathways
#'
#' psea <- PathSEA(rp, db[w])
#' ## [15:35:29] Working on collection: c3_TFT
#' ## [15:35:29] Common pathway sets removed from bgset.
#' ## [15:35:29] Column-ranking collection...
#' ## [15:35:29] Computing enrichments...
#' ## [15:35:29] done.
#' ## [15:35:29] Working on collection: c4_CGN
#' ## [15:35:29] Common pathway sets removed from bgset.
#' ## [15:35:29] Column-ranking collection...
#' ## [15:35:29] Computing enrichments...
#' ## [15:35:29] done.
#'
#' getResults(psea, "c3_TFT")
#' ##                         ES        PV
#' ## (_)_mk_801       0.7142857 0.1666667
#' ## (_)_atenolol     0.7142857 0.1666667
#' ## (+)_isoprenaline 0.5714286 0.4000000
#' ## (+/_)_catechin   0.5714286 0.4000000
#' ## (+)_chelidonine  0.3333333 0.9333333
#'
#' unlink(repo_path, TRUE)
#'
#' @export
PathSEA <- function(rp_peps, pathways, bgsets="all",
                    collections="all", subset="all", details=TRUE,
                    rankingFun=rankPEPsByCols.SPV)
{
    checkSets(rp_peps, pathways)
    
    pathways <- convertFromGSetClass(pathways)
    
    pathways <- pwList2pwStruct(pathways)

    if(!(length(bgsets) == 1 && bgsets == "all")) {
        checkSets(rp_peps, bgsets)
        bgsets <- pwList2pwStruct(convertFromGSetClass(bgsets))
    }

    
    if(!(length(collections) == 1 && collections=="all")) {

        if(!all(collections %in% names(pathways)))
            say(paste("There is at least one selected collections for which",
                      "no pathway has been provided"), "warning")
        
        offcols <- setdiff(collections, getCollections(rp_peps))
        if(length(offcols) > 0)
            say("The following collections are not in the repository:",
                "error", offcols)
    } else collections <- getCollections(rp_peps)

    for(i in seq_along(pathways)) {
        dbi <- names(pathways)[i]
        if(dbi %in% collections && !rp_peps$has(dbi))
            say("Could not find PEPs: ", "error", names(pathways)[i])
    }
    
    collections <- intersect(names(pathways), collections)
    
    if(length(setdiff(names(pathways), collections)) > 0) {
        say(paste("Removing pathways not in specified collections"))
        pathways <- pathways[collections]
    }
    
    if(details)
        thedetails <- list() else thedetails <- NULL     
    
    res <- list()
    for(i in seq_along(pathways))
    {
        say(paste0("Working on collection: ", collections[i]))
        gmd <- names(pathways[[collections[i]]])

        allsets <- names(.loadCollection(rp_peps, collections[i]))
        
        if(length(bgsets) == 1 && bgsets=="all") {
            bgset <- allsets
        } else bgset <- names(bgsets[[collections[i]]])

        if(length(intersect(gmd, bgset)) > 0) {
            bgset <- setdiff(bgset, gmd)
            say("Common pathway sets removed from bgset")
        }
        rankingset <- c(gmd, bgset)
        
        if(length(subset)==1 && subset == "all") {
            peps <- .loadPEPs(rp_peps, collections[i])
        } else peps <- .loadPEPs(rp_peps, collections[i], subset)

        notok <- rankingset[!(rankingset %in% rownames(peps))]
        if(length(notok)>0)
            say(paste0("Pathway set ids not found in ", collections[i], ": ",
                       paste(notok, collapse=", ")), "error")

        say(paste0("Column-ranking collection"))
        ranked <- rankingFun(peps, rankingset)

        say(paste0("Computing enrichments"))
        ks <- apply(ranked, 2, function(col) {
            inset <- col[gmd]
            inset <- inset[!is.na(inset)]
            outset <- col[bgset]
            outset <- outset[!is.na(outset)]
            if(length(inset)>1 && length(outset)>1) {
                res <- ks.test.2(inset, outset, maxCombSize=10^10)
            } else res <- list(ES=NA, p.value=NA)
            res
        })
        
        PVs <- sapply(ks, get, x="p.value")
        sorter <- order(PVs)
        
        res[[collections[i]]] <- data.frame(
            ES = sapply(ks, get, x="ES")[sorter],
            PV = PVs[sorter]
        )

        if(details)
            thedetails[[collections[i]]] <- ranked[gmd, sorter]
        
        say("done")
    }

    return(list(PathSEA=res, details=thedetails))
}
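
## Illustrative sketch (not part of the package): a user-defined rankingFun
## for PathSEA respecting the documented contract (column-wise ranks from 1
## to the number of non-NA gene sets, NAs kept). Unlike rankPEPsByCols.SPV,
## this hypothetical variant ranks pathways by p-value only, ignoring the
## sign of the enrichment score.
rankPEPsByCols.PVonly <- function(peps, rankingset="all")
{
    if(length(rankingset) == 1 && rankingset == "all")
        rankingset <- seq_len(nrow(peps[["PV"]]))
    PVs <- peps[["PV"]][rankingset, , drop=FALSE]
    apply(PVs, 2, rank, ties.method="random", na.last="keep")
}
## It could then be passed to PathSEA as, e.g.:
## PathSEA(rp, db[w], rankingFun=rankPEPsByCols.PVonly)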


#' Finds pathways including given genes.
#'
#' Given one or more genes, finds the pathways that involve them in
#' each collection of the repository. This can be used to define a set
#' of pathways for the \code{\link{PathSEA}}.
#'
#' @inheritParams dummyFunction
#' @param genes A vector of gene identifiers of the same type as that
#'     used to create the repository.
#' @param and If set to TRUE (default), will return sets containing
#'     all of \code{genes}. Otherwise will return the sets containing
#'     any of \code{genes}.
#' @return A database of pathways suitable as input to
#'     \code{\link{PathSEA}}.
#' @seealso createRepository, PathSEA
#' @examples
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#'
#' ## Finding all pathways containing "FAM126A":
#' subpw <- gene2pathways(rp, "FAM126A")
#'
#' print(names(subpw))
#'
#' unlink(repo_path, TRUE)
#'
#' @export
gene2pathways <- function(rp, genes, and=TRUE)
{
    dbs <- getCollections(rp)
    mods <- list()
    for(i in seq_along(dbs)) {
        db <- loadCollection(rp, dbs[i])
        if(and) {
            w <- sapply(db, function(s) all(genes %in% geneIds(s)))
        } else w <- sapply(db, function(s) any(genes %in% geneIds(s)))
        mods <- GeneSetCollection(c(mods, db[w]))
    }

    return(mods)        
}


## convert2SummarizedExperiment <- function(rp, res, bg)
## {
##     restype <- names(res)[[1]]
##     SEs <- list()
##     for(i in 1:length(res)) {
##         dfi <- t(data.frame(res$CondSEA[[i]], res$details[[i]]))
##         conds <- data.frame(originalNames=c(
##                                 colnames(res$CondSEA[[i]]),
##                                 colnames(res$details[[i]]))
##                             )                            
##         coll <- loadCollection(rp, names(df)[i])
##         w <- match(colnames(dfi), sapply(coll, setIdentifier))
##         nms <- data.frame(setNames=sapply(coll, setName)[w])
##         data <- list(list(dfi))
##         names(data[[1]]) <- names(df)[i]
##         se <- SummarizedExperiment(data,
##                                    rowData=conds,
##                                    colData=nms)
##         metadata(se) <- list(
##             set = colnames(res$details[[i]]),
##             background = bg,
##             sysinfo=Sys.info()
##         )
##         SEs[[i]] <- se
##     }       
## }

gep2pep <- function(geps, sets, min_size, max_size, parallel=FALSE,
                    pbar=TRUE, SGEmode=FALSE) {

    pathw <- sets
    genemat <- geps
    genes <- rownames(genemat)

    x <- list()
    sets <- sapply(unname(pathw), get, x="set")

    ES <- matrix(NA, length(pathw), ncol(genemat))
    PV <- matrix(NA, length(pathw), ncol(genemat))

    if(pbar)
        pb <- txtProgressBar()
    
    for(j in seq_len(ncol(genemat)))
    {
        if(pbar)
            setTxtProgressBar(pb, (j-1)/ncol(genemat))
        genematj <- genemat[,j]

        if(!SGEmode) {
            '%dobest%' <- if (parallel) get('%dopar%') else get('%do%')
            set <- NULL ## to cope with R CMD check NOTE
            gres <- foreach(set = sets,
                            .export=c("gsea","ks.test.2")) %dobest%
                {
                    where <- match(set, genes)
                    where <- where[!is.na(where)]
                    gsea(where, genematj, min_size, max_size)
                }

            x[[j]] <- gres
            
            if(all(is.na(sapply(gres, "get", x="ES"))))
                say(paste0("All NAs in PEP for profile: ",
                           colnames(genemat)[j]), "warning")

        } else { ## SGE mode
            nona <- genematj[!is.na(genematj)]
            pv <- .fastKSp(nona)
            es <- .fastKSES(nona)

            if(all(is.na(pv)) || all(is.na(es)))
                say(paste0("All NAs in PEP for profile: ",
                           colnames(genemat)[j]), "warning")
            PV[,j] <- pv
            ES[,j] <- es
        }
    }
        
    if(pbar) {
        setTxtProgressBar(pb, 1)
        close(pb)
    }

    if(!SGEmode)
        for(i in seq_len(ncol(genemat))){
            PV[,i] <- sapply(x[[i]], "get", x="p")
            ES[,i] <- sapply(x[[i]], "get", x="ES")
        }

    rownames(ES) <- rownames(PV) <-
        sapply(pathw, "get", x="id")
    colnames(ES) <- colnames(PV) <- colnames(genemat)
    
    return(list(ES=ES, PV=PV))
}


gsea <- function(S, ranks_list, min_size, max_size, alternative = "two.sided")
{
    S <- S[!(is.na(S))]
    S1 <- ranks_list[S]
    S2 <- ranks_list[-S]

    if(length(S1) < min_size || length(S1) > max_size ||
       length(S2) < min_size ||
       all(is.na(S1)) || all(is.na(S2)))
        return(list(ES=NA, p=NA))

    ks <- ks.test.2(S1, S2, alternative=alternative, maxCombSize=10^10)

    return(list(ES=ks$ES, p=ks$"p.value", edge=ks$edge));
}


storePEPs <- function(rp, db_id, peps, rawmode_suffix,
                      rawmode_outdir)
{
    rawmode <- !is.null(rawmode_suffix)
    
    if(rp$has(db_id) && !rawmode) {
        curmat <- rp$get(db_id)

        ## checking what's new and what's old
        curpeps <- colnames(curmat$ES)
        newpeps <- setdiff(colnames(peps$ES), curpeps)
        oldpeps <- intersect(colnames(peps$ES), curpeps)

        ## replacing existing PEPs
        curmat[["ES"]][, oldpeps] <- peps$ES[, oldpeps]
        curmat[["PV"]][, oldpeps] <- peps$PV[, oldpeps]

        ## adding new PEPs
        peps$ES <- cbind(curmat$ES, peps$ES[, newpeps, drop=FALSE])
        peps$PV <- cbind(curmat$PV, peps$PV[, newpeps, drop=FALSE])
    }

    if(!rawmode) {        
        say("Storing PEPs to the repository...")
        
        rp$put(peps, db_id,
               paste0("Pathway data for collection ", db_id,
                      ". It contains 2 matrices: 1 for enrichement scores ",
                      "(signed Kolmogorov Smirnov statistic) and one for ",
                      "the corresponding p-values."),
               c("gep2pep", "pep"), replace=TRUE,
               depends = paste0(db_id, "_sets"),
               prj = get_repo_prjname(rp))

        say("Storing condition information...")
        perts <- rp$get("conditions")
        perts[[db_id]] <- colnames(peps$ES)
        rp$set("conditions", perts)
    } else {
        fdb <- gsub("[^[:alnum:]]", "_", db_id)
        fname <- paste0(fdb,
                        "#",
                        format(rawmode_suffix, scientific=FALSE),
                        ".RDS")
        fname <- file.path(rawmode_outdir, fname)

        if(!dir.exists(rawmode_outdir))
            dir.create(rawmode_outdir)
        
        say(paste0("Storing PEPs to: ", fname))
        saveRDS(peps, fname)
    }

    say("Done.")
}

say <- function(txt, type="diagnostic", names=NULL) {
    ## can be error or warning
    msg <- paste0(
        "[",
        format(Sys.time(), format="%H:%M:%S"),
        "] ",
        txt,
        paste(names, collapse = ", ")
    )

    if(type=="error") {
        stop(msg, call.=FALSE)
    } else if (type=="warning") {
        warning(msg, call.=FALSE, immediate.=TRUE)
    } else message(msg)
}

ks.sign <- function (x, y)
{
    ## sign of the KS statistic: +1 if the maximum deviation between the
    ## two empirical CDFs is reached with x shifted towards low values
    ## (i.e. towards the top of the ranking), -1 otherwise
    n.x <- as.double(length(x))
    n.y <- length(y)

    w <- c(x, y)        
    z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))

    return(sign(z[which.max(abs(z))]))
}


ks.test.2 <- function(x, y, ...) {
    ks <- ks.test(x, y, ...)
    ks[["ES"]] <- unname(ks.sign(x, y)*ks$statistic)
    return(ks)
}
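
## Illustrative sketch (not part of the package): the sign added by ks.sign
## makes the ES positive when x is shifted towards low ranks (top of the
## ranking) and negative when x is shifted towards high ranks.
.signedESExample <- function() {
    top    <- ks.test.2(1:5, 6:20)    ## x at the top: positive ES
    bottom <- ks.test.2(16:20, 1:15)  ## x at the bottom: negative ES
    c(top=top$ES, bottom=bottom$ES)
}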


rankPEPsByCols.SPV <- function(peps, rankingset="all")
{
    rankPEP <- function(PVs, ESs)
    {
        ## rank by p-value, placing UP-regulated (positive ES) pathways
        ## at the top and DOWN-regulated ones at the bottom
        sorter <- abs(1-PVs)
        pos <- ESs > 0
        pos[is.na(pos)] <- FALSE
        sorter[pos] <- -sorter[pos]
        return(rank(sorter, ties.method = "random", na.last="keep"))
    }

    if(length(rankingset) == 1 && rankingset == "all")
        rankingset <- seq_len(nrow(peps[["ES"]]))
    
    PVs <- peps[["PV"]][rankingset, ]
    ESs <- peps[["ES"]][rankingset, ]
    x <- sapply(seq_len(ncol(PVs)), function(i) rankPEP(PVs[,i], ESs[,i]))
    colnames(x) <- colnames(PVs)
    rownames(x) <- rownames(PVs)
    attr(x, "nas") <- apply(x, 2, function(x) sum(is.na(x)))
    return(x)
}
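
## Illustrative sketch (not part of the package): rankPEPsByCols.SPV on a
## toy pair of PEP columns. Pathways strongly UP-regulated (positive ES,
## small p-value) get the smallest ranks, strongly DOWN-regulated ones the
## largest.
.SPVRankingExample <- function() {
    peps <- list(
        ES = matrix(c( 0.9, -0.8,  0.2, -0.1,
                      -0.5,  0.7, -0.3,  0.4), 4, 2,
                    dimnames=list(paste0("pw", 1:4), c("cond1", "cond2"))),
        PV = matrix(c(0.01, 0.02, 0.60, 0.70,
                      0.05, 0.10, 0.80, 0.90), 4, 2,
                    dimnames=list(paste0("pw", 1:4), c("cond1", "cond2")))
    )
    rankPEPsByCols.SPV(peps)  ## in cond1, pw1 gets rank 1 and pw2 rank 4
}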

rankPEPsByCols.NES <- function(peps, rankingset="all")
{
    if(length(rankingset) == 1 && rankingset == "all")
        rankingset <- seq_len(nrow(peps[["ES"]]))

    ESs <- t(scale(t(peps$ES)))
    attr(ESs, "scaled:center") <- NULL
    attr(ESs, "scaled:scale") <- NULL
    x <- apply(-ESs, 2, rank, ties.method = "random", na.last="keep")
    return(x)
}

rankPEPsByCols.ES <- function(peps, rankingset="all")
{
    if(length(rankingset) == 1 && rankingset == "all")
        rankingset <- seq_len(nrow(peps[["ES"]]))
  
    x <- apply(-peps$ES, 2, rank, ties.method = "random", na.last="keep")
    return(x)
}


rankPEPsByRows.ES <- function(peps)
{
    ESs <- peps[["ES"]]
    x <- t(apply(-ESs, 1, rank, ties.method = "random", na.last="keep"))
    attr(x, "nas") <- apply(x, 1, function(x) sum(is.na(x)))
    return(x)
}

attachInfo <- function(rp, results)
{
    type <- names(results)[[1]]
    dbs <- names(results[[type]])
    newres <- list()
    for(i in seq_along(results[[type]])) {
        db <- .loadCollection(rp, dbs[i])
        resmat <- results[[type]][[i]]

        if(type == "CondSEA") {
            modIDs <- rownames(resmat)
            newres[[i]] <- cbind(
                Pathway = sapply(db[modIDs], get, x="name"),
                Description = sapply(db[modIDs], get, x="desc"),
                results[[type]][[i]]
            )
            if(!is.null(results[["details"]]))
                newres[[i]] <- cbind(newres[[i]],
                                     results[["details"]][[i]])

        } else {
            res <- cbind(
                Condition = rownames(results[[type]][[i]]),
                results[[type]][[i]]
            )
            
            if(!is.null(results[["details"]])) {
                modIDs <- rownames(results[["details"]][[i]])
                modnames <- sapply(db[modIDs], get, x="name")
                details <- t(results[["details"]][[i]])

                colnames(details) <- paste0(modnames, " (", modIDs, ")")
                newres[[i]] <- cbind(res, details)
            } else {
                newres[[i]] <- res
            }
        }

    }
    names(newres) <- names(results[[type]])
    return(newres)
}


checkGEPsFormat <- function(geps)
{
    dims <- dim(geps)
    if(length(dims) != 2)
        say("GEPs must be a two-dimensional matrix", "error")
    if(dims[[1]]==1)
        say(paste("GEPs must have genes over rows,",
                  "while input matrix seems to have 1 row"),
            "error")

    gnames <- rownames(geps)
    if(is.null(gnames))
        say("GEPs must have rownames (as gene IDs)", "error")
    if(any(duplicated(gnames)))
        say("GEP rownames must be unique", "error")
    
    pnames <- colnames(geps)
    if(is.null(pnames))
        say("GEPs must have colnames (as condition IDs)", "error")
    if(any(duplicated(pnames)))
        say("GEP colnames must be unique", "error")
    
    not_unique <- apply(geps, 2, function(x) {
        any(duplicated(x))})
    mins <- apply(geps, 2, min, na.rm=TRUE)
    maxs <- apply(geps, 2, max, na.rm=TRUE)
    
    if(any(mins != 1) || any(maxs != dims[1]) || any(not_unique))
        say(paste("GEP columns must be ranks. Check",
                  "that each column is made of integer numbers from 1",
                  "to the number of rows."), "error")
    
}
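
## Illustrative sketch (not part of the package): building a GEP matrix
## that passes checkGEPsFormat from raw expression values, i.e. unique gene
## rownames, condition colnames, and each column being a permutation of
## 1..nrow. Ranking by decreasing expression is just one possible
## convention, shown here for illustration.
.exprToGEPsExample <- function() {
    set.seed(1)
    expr <- matrix(rnorm(20), 5, 4,
                   dimnames=list(paste0("gene", 1:5), paste0("cond", 1:4)))
    geps <- apply(expr, 2, function(x) rank(-x, ties.method="first"))
    rownames(geps) <- rownames(expr)
    checkGEPsFormat(geps)  ## silent if the format is valid
    geps
}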

get_repo_prjname <- function(rp) {
    prjname <- names(
        rp$entries()[sapply(lapply(rp$entries(), get, x="tags"),
                            `%in%`, x="#project")]
    )[[1]] ## take the first for robustness, should be only 1
}

## splits a flat list of pathways into sublists according to
## collections
pwList2pwStruct <- function(db, collids) {
    collids <- .makeCollectionIDs(db)
    colls <- unique(collids)
    out <- list()
    for(i in seq_along(colls))
        out[[colls[[i]]]] <- db[collids == colls[[i]]]
    return(out)
}


getPEPs <- function(rp, id) {
    if(! id %in% getCollections(rp))
        say(paste("Collection is not one of getCollections():", id), "error")

    if(! rp$has(id))
        say(paste("No PEP found for collection:", id), "error")
    
    return(rp$get(id))
}


checkSets <- function(rp, sets) {        
    coll_ids <- makeCollectionIDs(sets)
    ucoll_ids <- unique(coll_ids)
    
    off <- setdiff(ucoll_ids, getCollections(rp))
    if(length(off)>0)
        say(paste0("The following collections could not be found: ",
                   paste(off, collapse=", ")), "error")

    for(i in seq_along(ucoll_ids)) {
        sub <- sapply(sets[which(coll_ids == ucoll_ids[i])], setIdentifier)
        coll <- .loadCollection(rp, ucoll_ids[i])
        off <- setdiff(sub, names(coll))
        if(length(off) > 0)
            say(paste0("The following pathways could not be found ",
                       "in collection ", ucoll_ids[i], ": "), "error", off)
    }
}

convertFromGSetClass <- function(gsets) {
    res <- list()
    for(i in seq_along(gsets)) {
        set <- gsets[[i]]
        res[[setName(set)]] <- list(
            id = setIdentifier(set),
            name = setName(set),
            category = attributes(collectionType(set))$category,
            subcategory = attributes(collectionType(set))$subCategory,
            organism = organism(set),
            desc = description(set),
            set = geneIds(set)
        )
    }
    names(res) <- sapply(res, get, x="id")
    return(res)
}

.loadCollection <- function(rp, db) {
    thisdb <- rp$get(paste0(db, "_sets"))
    return(convertFromGSetClass(thisdb))
}

## loadCollection <- function(rp, db) {
##     thisdb <- rp$get(paste0(db, "_sets"))
##     return(thisdb)
## }


## this calls makeCollectionIDs with the old format
.makeCollectionIDs <- function(sets) {
    dbs <- sapply(sets, get, x="category")
    subdbs <- sapply(sets, get, x="subcategory")
    w <- which(subdbs=="" | is.na(subdbs))
    if(length(w)>0)
        subdbs[w] <- dbs[w]
    db_ids <- paste(dbs, subdbs, sep="_")
    return(db_ids)
}
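
## Illustrative sketch (not part of the package): how .makeCollectionIDs
## builds collection identifiers such as "c3_TFT" by joining the category
## and subcategory fields, reusing the category when the subcategory is
## empty. Entries below are made up.
.collectionIDsExample <- function() {
    sets <- list(
        list(category="c3", subcategory="TFT"),
        list(category="c4", subcategory="CGN"),
        list(category="SGE", subcategory="")
    )
    .makeCollectionIDs(sets)  ## "c3_TFT" "c4_CGN" "SGE_SGE"
}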


#' Merge multiple PEPs to build a repository of consensus PEPs
#' @inheritParams dummyFunction
#' @param rpIn_path path to existing gep2pep repository
#' @param rpOut_path path where the new merged repository will be
#' created
#' @param mergestr a named list of character vectors, each one
#' including a set of PEP names. For each list entry, a consensus PEP
#' will be created and assigned the entry name.
#' @param progressBar if TRUE, show a progress bar
#' @details The merging is performed as follows. Given N PEPs, the
#' corresponding consensus PEP gets as enrichment score the average of
#' the enrichment scores of the N PEPs, and as p-value the combination
#' of the N PEP p-values by Fisher's method.
#' @return Nothing, used for side effects.
#' @export
#' @examples
#'
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#' rp <- createRepository(repo_path, db)
#' geps <- loadSampleGEP()
#' buildPEPs(rp, geps)
#'
#' mergestr <- list(
#'     che_iso = c("(+)_chelidonine", "(+)_isoprenaline"),
#'     cat_mk = c("(+/_)_catechin", "(_)_mk_801")
#' )
#'
#' merged_path <- file.path(tempdir(), "gep2pepTempMerged")
#'
#' createMergedRepository(repo_path, merged_path, mergestr)
#'
#' unlink(repo_path, TRUE)
#' unlink(merged_path, TRUE)
createMergedRepository <- function(rpIn_path, rpOut_path, mergestr,
                                   progressBar=TRUE, collections="all") {
    ## the following are hidden parameters
    mergeFunc <- "mean"
    parallel <- FALSE

    if(!missing(mergestr)) {
        if(!is.list(mergestr)) {
            say(paste("mergestr must be a named list of character",
                      "vectors (PEP names)"),
                "error")
        }
        if(is.null(names(mergestr)) || any(duplicated(names(mergestr)))) {
            say("mergestr entries must be uniquely named", "error")
        }
    }

    ## if(any(duplicated(unlist(mergestr)))) {
    ##     say("mergestr includes duplicated entries", "error")
    ## }

    
    rpin <- openRepository(rpIn_path)
    if(!file.exists(rpOut_path)) {
        rpout <- createRepository(rpOut_path, NULL)
    } else {
        say("Output repository already exists")
        rpout <- openRepository(rpOut_path)
        if(missing(mergestr)) {
            say("Loading merge structure from the repository")
            mergestr <- rpout$get("merging structure")
        }
    }

    colls <- getCollections(rpin)
    if(!(length(collections)==1 && collections=="all")) {
        colls <- intersect(colls, collections)
    }

    if(length(colls)>0) {
        for(i in seq_along(colls)) {
            say(paste0("Copying sets for collection: ", colls[i]))
            rpin$copy(rpout, paste0(colls[i], "_sets"))        
            say("Merging PEPs")
            mergedPEPs <- .mergePEPs(rpin, colls[i], mergestr,
                                     progressBar, mergeFunc, parallel)
            suppressMessages(storePEPs(rpout, colls[i], mergedPEPs, NULL, NULL))
            say("Merged PEPs stored")
        }
        say("Storing merging structure")
        
        if(rpout$has("merging structure")) {
            say("Existing merging structure will not be overwritten",
                "warning")
        } else {
            rpout$put(mergestr, "merging structure",
                      "list of single PEPs used to make each merged PEP")
        }
        
        say("Done.")
    } else say("None of the specified collections is available", "warning")
}





.mergePEPs <- function(rpin, coll, mergestr, progressBar,
                       mergeFunc = "mean", parallel=FALSE) {
    
    if(!is.character(unlist(mergestr)))
        say("mergestr entries must be of class character", "error")
    ish5 <- "#hdf5" %in% rpin$tags(coll)
    
    nrows <- length(loadCollection(rpin, coll))
    
    outpeps <- list(
        ES=matrix(NA, nrows, length(mergestr)),
        PV=matrix(NA, nrows, length(mergestr))
    )

    if(progressBar)
        pb <- txtProgressBar()

    if(mergeFunc == "mean") {
        if(parallel)
            say("Parallelization will not be used for mean method")
        if(!ish5)
            allpeps <- .loadPEPs(rpin, coll)

        for(i in seq_along(mergestr)) {
            if(progressBar)
                setTxtProgressBar(pb, i/length(mergestr))

            if(ish5) {
                peps <- .loadPEPs(rpin, coll, mergestr[[i]])
            } else {
                peps <- list(ES=allpeps$ES[, mergestr[[i]], drop=FALSE],
                             PV=allpeps$PV[, mergestr[[i]], drop=FALSE])
            }

            mpeps <- list(
                ## consensus ES: row-wise mean of the single PEP ESs
                ES = apply(peps$ES, 1, mean),
                ## consensus PV: Fisher combination statistic
                ## S = -2*sum(log(p)), stored as the chi-squared
                ## lower-tail probability with 2*length(ps) df
                PV = apply(peps$PV, 1, function(ps) {
                    S <- -2*sum(log(ps))
                    cump <- pchisq(S, 2*length(ps))
                    return(cump)
                })
            )

            outpeps$ES[,i] <- mpeps$ES
            outpeps$PV[,i] <- mpeps$PV
        }

        if(ish5) {
            rownames(outpeps$ES) <-
                rownames(outpeps$PV) <-
                rownames(peps$ES)
        } else {
            rownames(outpeps$ES) <-
                rownames(outpeps$PV) <-
                rownames(allpeps$ES)
        }

    } else if(mergeFunc == "CondSEA") {
        for(i in seq_along(mergestr)) {
            if(progressBar)
                setTxtProgressBar(pb, i/length(mergestr))
        
            ranked <- suppressMessages(
                .getPEPRanks(rpin, coll,
                             unlist(mergestr),
                             subcols = mergestr[[i]],
                             rankingFun = rankPEPsByRows.ES
                             )
                )

            ks <- .doPSEA(ranked, length(unlist(mergestr)), parallel)
                                  
            outpeps$PV[,i] <- sapply(ks, get, x="p.value")
            outpeps$ES[,i] <- sapply(ks, get, x="ES")            
        }

        rownames(outpeps$ES) <-
            rownames(outpeps$PV) <-
            rownames(ranked)

    } else say("Wrong merging method", "error")

    if(progressBar)
        close(pb)    
    
    colnames(outpeps$ES) <-
        colnames(outpeps$PV) <-
        names(mergestr)

    return(outpeps)
}
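
## Illustrative sketch (not part of the package): the "mean" merging rule
## used by .mergePEPs on a toy pair of PEPs. The consensus ES is the
## row-wise mean of the single ESs; the consensus PV comes from the Fisher
## combination statistic S = -2*sum(log(p)), stored as the chi-squared
## lower-tail probability, exactly as in the code above.
.mergeTwoPEPsExample <- function() {
    ES <- matrix(c(0.8, -0.4, 0.6, -0.2), 2, 2,
                 dimnames=list(c("pw1", "pw2"), c("pepA", "pepB")))
    PV <- matrix(c(0.01, 0.30, 0.05, 0.50), 2, 2,
                 dimnames=list(c("pw1", "pw2"), c("pepA", "pepB")))
    mergedES <- apply(ES, 1, mean)
    mergedPV <- apply(PV, 1,
                      function(ps) pchisq(-2*sum(log(ps)), 2*length(ps)))
    cbind(ES=mergedES, PV=mergedPV)
}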


.buildGene2SetIndex <- function(rp, collection_name)
{
    collection <- loadCollection(rp, collection_name)
    allsets <- sapply(collection, geneIds)
    setids <- sapply(collection, setIdentifier)
    allgenes <- unique(unlist(allsets))
    
    res <- list()
    pb <- txtProgressBar()
    for(i in seq_along(allgenes)) {
        setTxtProgressBar(pb, i/length(allgenes))
        w <- sapply(allsets, `%in%`, x=allgenes[i])
        res[[i]] <- c(
            gene = allgenes[i],
            sets = paste(setids[w], collapse="; ")
        )
    }
    close(pb)
    return(res)
}

.exportCollectionsInfo <- function(rp, outfile) {
    colls <- getCollections(rp)

    dbs <- ids <- names <- vector("character")
    for(i in seq_along(colls)) {
        coll <- loadCollection(rp, colls[i])
        dbs <- c(dbs, rep(colls[i], length(coll)))
        ids <- c(ids, sapply(coll, setIdentifier))
        ## names <- c(names, paste0("[", colls[i], "] ", sapply(coll, setName)))
        names <- c(names, sapply(coll, setName))
    }

    return(cbind(dbs, ids, names))
}

## This function is copy-pasted from the utils packages, as it was not
## exported and the check command would complain about using the `:::`
## operator. Original name: utils:::.format.object_size
.format.object_size <- function (x, units = "b",standard = "auto",
                                 digits = 1L, ...)
{
    known_bases <- c(legacy = 1024, IEC = 1024, SI = 1000)
    known_units <- list(SI = c("B", "kB", "MB", "GB", "TB", "PB", 
        "EB", "ZB", "YB"), IEC = c("B", "KiB", "MiB", "GiB", 
        "TiB", "PiB", "EiB", "ZiB", "YiB"), legacy = c("b", "Kb", 
        "Mb", "Gb", "Tb", "Pb"), LEGACY = c("B", "KB", "MB", 
        "GB", "TB", "PB"))
    units <- match.arg(units, c("auto", unique(unlist(known_units), 
        use.names = FALSE)))
    standard <- match.arg(standard, c("auto", names(known_bases)))
    if (standard == "auto") {
        standard <- "legacy"
        if (units != "auto") {
            if (grepl("iB$", units)) 
                standard <- "IEC"
            else if (grepl("b$", units)) 
                standard <- "legacy"
            else if (units == "kB") 
                stop("For SI units, specify 'standard = \"SI\"'")
        }
    }
    base <- known_bases[[standard]]
    units_map <- known_units[[standard]]
    if (units == "auto") {
        power <- if (x <= 0) 
            0L
        else min(as.integer(log(x, base = base)), length(units_map) - 
            1L)
    }
    else {
        power <- match(toupper(units), toupper(units_map)) - 
            1L
        if (is.na(power)) 
            stop(gettextf("Unit \"%s\" is not part of standard \"%s\"", 
                sQuote(units), sQuote(standard)), domain = NA)
    }
    unit <- units_map[power + 1L]
    if (power == 0 && standard == "legacy") 
        unit <- "bytes"
    paste(round(x/base^power, digits = digits), unit)
}

#' Adds a collection of single-gene pseudo-sets.
#'
#' This function can be used to add single-gene-based (as opposed to
#' pathway-based) collections. Sets including a single gene do not need
#' to go through the normal Kolmogorov-Smirnov statistic computation
#' and are treated differently for performance.
#'  
#' @inheritParams dummyFunction
#' @param genes a character vector containing the gene names. For each
#'     of them a single-gene \code{GeneSet} will be created.
#' @param organism Character vector used to annotate the created
#'     sets. "Homo Sapiens" by default.
#' @return Nothing, used for side effects.
#' @seealso buildPEPs
#' @details
#'
#' Enrichment Scores and p-values for sets including a single gene are
#' computed with dedicated (fast) routines. Although a statistic based
#' on a single gene is not efficient per se, it is useful to have data
#' in the same format as pathway-based profiles. \code{buildPEPs}
#' internally calls single gene dedicated routines whenever a gene set
#' collection is tagged (see repo function \code{tag}) with "SGE"
#' ("Single Gene Expression"), which is done automatically by
#' \code{addSingleGeneSets}. In that case, the \code{min_size}
#' parameter is ignored.
#' 
#' @export
#' @examples
#' 
#' db <- loadSamplePWS()
#' repo_path <- file.path(tempdir(), "gep2pepTemp")
#'
#' rp <- createRepository(repo_path, db)
#'
#' ## The following will create PEPs in 2 separate files
#' geps <- loadSampleGEP()
#' addSingleGeneSets(rp, rownames(geps))
#'
#' unlink(repo_path, TRUE)
#' 
addSingleGeneSets <- function(rp, genes, organism = "Homo Sapiens")
{
    gs <- list()
    descr <- "Pseudo-gene-set containing only gene "
    for(i in seq_along(genes)) {
        g <- genes[i]
        gs[[i]] <- GeneSet(g,
                           shortDescription = paste0(descr, g),
                           longDescription = paste0(descr, g),
                           setName = g,
                           setIdentifier = paste0("SGE-", g),
                           organism = organism,
                           collectionType = CategorizedCollection(
                               category="SGE",
                               subCategory="SGE"
                           ))
    }
    sge <- GeneSetCollection(gs)
    rp$put(sge, "SGE_sets", "Pathway information for collection SGE",
           tags=c("gep2pep", "sets", "SGE"))
}

.fastKSp <- function(v) {
    ## converts a vector of ranks to corresponding KS p-values
    ## used for single-gene sets
    n <- length(v)-sum(is.na(v))
    up <- seq(1, floor(n/2))/(n/2)
    if(length(v)%%2==1)
        ret <- c(up, 1, rev(up)) else ret <- c(up, rev(up))
    return(ret[v])
}

.fastKSES <- function(v) {
    ## converts a vector of ranks to corresponding KS Enrichment
    ## Scores. Used for single-gene sets
    n <- length(v)
    up <- -1/(n-1)*(seq_len(ceiling(n/2))-1) + 1
    if(length(v)%%2==1)
        ret <- c(up[-length(up)], -rev(up)) else ret <- c(up, -rev(up))
    return(ret[v])
}
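
## Illustrative sketch (not part of the package): behaviour of the fast
## single-gene routines on a complete vector of ranks (no NAs). Genes at
## the top of the profile get an ES close to +1, genes at the bottom an ES
## close to -1, and genes in the middle the largest (least significant)
## p-values.
.fastKSExample <- function() {
    v <- 1:10  ## ranks of 10 genes in one GEP column
    ## rank 1 -> ES = 1, rank 10 -> ES = -1, middle ranks -> PV = 1
    cbind(rank=v, ES=.fastKSES(v), PV=.fastKSp(v))
}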



## Utilities removed from package
##
## exportPEPs <- function(rp, PEPname, outfile=paste0(PEPname, ".xls"),
##                        collections="all")
## {
##     colls <- getCollections(rp)

##     if(!(length(collections)==1 && collections=="all"))
##         colls <- intersect(collections, colls)

##     newres <- list()
    
##     for(i in 1:length(colls)) {
##         db <- .loadCollection(rp, colls[i])
##         PEPs <- .loadPEPs(rp, colls[i], PEPname)
##         ord <- order(PEPs$PV)
##         PEPs$ES <- PEPs$ES[ord,,drop=F]
##         PEPs$PV <- PEPs$PV[ord,,drop=F]
        
##         modIDs <- rownames(PEPs$ES)

##         newres[[i]] <- data.frame(
##             Pathway = sapply(db[modIDs], get, x="name"),
##             Description = sapply(db[modIDs], get, x="desc"),
##             ES = PEPs$ES[,PEPname],
##             p.value = PEPs$PV[,PEPname],
##             FDR = p.adjust(PEPs$PV, "fdr", n=sum(!is.na(PEPs$PV))),
##             check.names = FALSE
##         )
##     }
##     names(newres) <- gsub(":", "_", colls)

##     WriteXLS(newres, outfile, BoldHeaderRow = TRUE, FreezeRow = 1,
##              AutoFilter = TRUE)
## }



## do_and_show_gsea <- function(gep, set) {
##     setgenes <- geneIds(set)    
    
##     x <- gep[setgenes]
##     x <- x[!is.na(x)]
##     y <- setdiff((1:sum(!is.na(gep))), x)
##     n.x <- as.double(length(x))
##     n.y <- length(y)
##     w <- c(x, y)        
##     z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))

##     m <- which.max(abs(z))
##     s <- sign(z[m])    

##     ks <- ks.test.2(x, y, maxCombSize=10^10)
##     ES <- unname(s*ks$statistic)
##     p <- ks$p.value

##     if(s > 0) {
##         leadset <- x[x <= m]
##     } else leadset <- x[x >= m]

##     fname <- paste(drug, dbi, pwi, sep="_")

##     say("Creating PNG device")
##     pngf <- paste0(outdir, "/", fname, ".png")
##     library(Cairo)
##     CairoPNG(pngf, 480*2, 480)
##     ##png(pngf, 480*2, 480)

##     say("Creating plot")
##     plot(z, type="l", ylab="ES", xlab="Ranks",
##          main=paste0(drugname, "\n", G_allsubdbs[dbi], " - ", pwname),
##          lwd=1.5, col="gray")
##     points(m, z[m], pch=21, cex=2)
##     points(x, z[x], cex=.5)
##     if(s>0) {
##         points(x[x <= m], z[x[x <= m]], bg="green", col="green", pch=21)
##     } else {
##         points(x[x >= m], z[x[x >= m]], bg="red", col="red", pch=21)
##     }    
##     dev.off()


##     ## say("Creating table")
##     ## tabf <- paste0(outdir, "/", fname, ".txt")
##     ## tab <- cbind(profile[set,,drop=F], lead=F)
##     ## tab[names(leadset), 2] <- T
##     ## tab <- tab[order(tab[,1]),]
##     ## tab <- rbind(stats = c(ES,p), tab)
##     ## say("Saving table")
##     ## write.table(tab, tabf, quote=F, col.names=F)
##     ## say("Table saved")
## }