R/compareGenes.R
In MineICA: Analysis of an ICA decomposition obtained on genomics data

Documented in compareGenes

##' Compute and annotate the intersection or union between contributiong genes of components originating from different IcaSet objects.
##'
##' @title Union and intersection of contributing genes
##' @param icaSets List of \code{IcaSet} objects, the \code{geneNames} of the \code{IcaSet} objects must be from the same type (e.g, gene Symbols).
##' @param keepCompByIcaSet Indices of the components to be considered in each \code{IcaSet}.
##' @param lab The names of the icaSets (e.g the names of the datasets they originate from).
##' @param cutoff The cutoff (on the absolute centered and scaled projections) above which the genes have to be considered.
##' @param type \code{"intersection"} to restrict the list of genes to the ones that are common between all datasets, or \code{"union"} to consider all the union of genes available across the datasets.
##' @param annotate If TRUE (default) the genes are annotated using function \code{writeGenes}.
##' @param file The HTML file name where the genes and their annotations are written, default is \code{type}Genes_lab1-i_lab2-j_... where i and j are the component indices contained in \code{keepCompByIcaSet}.
##' @param mart The mart object (database and dataset) used for annotation, see function \code{useMart} of package \code{biomaRt}.
##' @return A data.frame containing \describe{  
##' \item{\code{typeID(icaSets[[1]])['geneID_biomart']}:}{the gene IDs,}
##' \item{median_rank}{the median of the ranks of each gene across the \code{IcaSet} objects,} 
##' \item{analyses}{the labels of the \code{IcaSet} objects in which each gene is above the given \code{cutoff}} 
##' \item{min_rank}{the minimum of the ranks of each gene across the \code{IcaSet} objects,}
##' \item{ranks}{the ranks of each gene in each \code{IcaSet} where it is available,}
##' \item{scaled_proj}{the centered and reduced projection of each gene in each \code{IcaSet} where it is available.}}
##' @author Anne Biton
##' @export
##' @seealso \code{\link[MineICA]{writeGenes}}
##' @examples \dontrun{
##' data(icaSetCarbayo)
##' mart <- useMart("ensembl", "hsapiens_gene_ensembl")
##'
##' ## comparison of two components
##' ## here the components come from the same IcaSet for convenience
##' ## but they must come from different IcaSet in practice.
##' compareGenes(keepCompByIcaSet = c(9,4), icaSets = list(icaSetCarbayo, icaSetCarbayo),
##'              lab=c("Carbayo", "Carbayo2"), cutoff=3, type="union",  mart=mart)
##' 
##' }
##' 
compareGenes <- function(keepCompByIcaSet, icaSets, lab,  cutoff=0, type = c("union","intersection"), annotate=TRUE, file, mart=useMart("ensembl", "hsapiens_gene_ensembl")) {

    type <- match.arg(type)
    
    if (missing(file) || is.null(file))        
        file <- paste(type,"Genes_", paste(lab,"-",unlist(keepCompByIcaSet),collapse="_",sep=""),sep="",collapse="")

    
    paths <- lab

    icaSet <- p <- NULL
    comps <- 
        foreach(icaSet=icaSets, comp=keepCompByIcaSet) %dopar% {
            comp <- SByGene(icaSet)[,comp]
            names(comp) <- geneNames(icaSet)
            return(comp)
        }

    names(comps) <- paths

    comps <- llply(comps,function(x) (x-mean(x))/sd(x))

    #### intersection of the available genes
    inter <-names(comps[[1]])
    for (comp in comps)
        inter <- intersect(inter,names(comp))


    compsinter <- llply(comps,function(x,inter) x[inter], inter = inter)
    #plot(as.data.frame(compsinter))
            
    if (!missing(cutoff) && !is.null(cutoff)) {
        interCut <- names(comps[[1]][abs(comps[[1]])>cutoff])
        for (comp in comps) interCut <- intersect(interCut,names(comp)[abs(comp)>cutoff])
    }
    ### use the signs of the genes in the intersection to determine the signs of the genes in the union
    ## I use the signs in one analysis and assume that the signs are the same in the other analysis...

    if (FALSE) {
    intersigns <- sign(comps[[1]][inter])
    if (type == "union") {
        #### union of the available genes
        union <-c()
        union2signs <- c()
        for (comp in comps) {
          if (sum(sign(comp[names(intersigns[intersigns==1])]))<0) comp <- -comp
          union <- unique(c(union,names(comp)))
          union2signs <- c(union2signs,sign(comp))
        }
        print(length(union))
    }
    else if (type == "intersection") {
        if (is.null(cutoff))
            stop("When intersection is used, cutoff should be specified")
        union2signs <- c()
        for (comp in comps) {
          if (sum(sign(comp[names(intersigns[intersigns==1])]))<0) comp <- -comp
          union2signs <- c(union2signs,sign(comp[interCut]))
        }
        union <- interCut
              
    }
}

        switch(type,
              "intersection"={union <- interCut},
              "union"={union <- c();for (comp in comps) {union <- unique(c(union,names(comp)))}}
              )
    compRank <- foreach(comp=comps) %dopar% {
        r <- rank(-abs(comp))
    }
    names(compRank) <- paths #if(!is.null(path2name)) path2name[paths] else basename(paths)



    ## add analysis where the gene is present
    union2an <- 
    foreach (p=paths, comp=comps, .combine = cbind) %dopar% {
            union2an <- match(union,names(comp))
            union2an[!is.na(union2an)] <- p
            union2an[is.na(union2an)] <- ""
            names(union2an) <- union
            return(union2an)

    }
    union2nban <- apply(union2an,1,function(x) {x=as.character(x);x=x[x != ""]; length(x)})
    
    union2an <- (apply(union2an,1,paste,collapse=",",sep=""))


    ## add rank details where the gene is present
    union2proj <- 
    foreach (p=paths, comp=comps, .combine = cbind) %dopar% {
            union2p <- signif(comp[match(union,names(comp))],2)
            names(union2p) <- union
            #union2p[!is.na(union2p)] <- comp[names(union2p[!is.na(union2p)])]
            union2p[is.na(union2p)] <- ""
            #names(union2p) <- union
            return(union2p)

    }
    union2proj <- (apply(union2proj,1,paste,collapse=",",sep=""))

    ## add rank details where the gene is present
    union2rank <- as.data.frame(llply(compRank, function(comp,union) comp[union], union = union))
    union2rankdetails <-apply(union2rank,1,paste,collapse=",",sep="")
    union2rankdetails <-gsub(union2rankdetails,pattern="NA",replacement="")
    union2rankmin <- apply(union2rank, 1,min, na.rm = TRUE)
    union2rankmedian <- apply(union2rank, 1,median, na.rm = TRUE)
    names(union2rankmin) <- union
    names(union2rankmedian) <- union

    if (type=="intersection") 
        d <- data.frame(min_rank = as.character(union2rankmin), median_rank = union2rankmedian, ranks = union2rankdetails, scaled_proj = union2proj, row.names = union)
    else {
        d <- data.frame(analyses = union2an, min_rank = as.character(union2rankmin), median_rank = union2rankmedian, ranks = union2rankdetails, scaled_proj = union2proj, row.names = union)
            d$analyses <- gsub(d$analyses,pattern="^,,",replacement="", perl = TRUE)
        d$analyses <- gsub(d$analyses,pattern="^,",replacement="", perl = TRUE)
        d$analyses <- gsub(d$analyses,pattern=",,,$",replacement="", perl = TRUE)
        d$analyses <- gsub(d$analyses,pattern=",,$",replacement="", perl = TRUE)
        d$analyses <- gsub(d$analyses,pattern=",,",replacement=",")
        d$nbAn <- union2nban
    }

    d <- d[order(abs(d$median_rank), decreasing = FALSE),]
    if (nrow(d)>1000) d <- d[1:1000,]

    d$ranks <- gsub(d$ranks,pattern="^,,",replacement="", perl = TRUE)
    d$ranks <- gsub(d$ranks,pattern="^,",replacement="", perl = TRUE)
    d$ranks <- gsub(d$ranks,pattern=",,,$",replacement="", perl = TRUE)
    d$ranks <- gsub(d$ranks,pattern=",,$",replacement="", perl = TRUE)
    d$ranks <- gsub(d$ranks,pattern=",,",replacement=",")
    d$scaled_proj <- gsub(d$scaled_proj,pattern="^,",replacement="", perl = TRUE)
    d$scaled_proj <- gsub(d$scaled_proj,pattern=",,,$",replacement="", perl = TRUE)
    d$scaled_proj <- gsub(d$scaled_proj,pattern=",,$",replacement="", perl = TRUE)
    d$scaled_proj <- gsub(d$scaled_proj,pattern=",,",replacement=",")

    
    
    ### annotate genes and order by mean rank value
    #if (FALSE) {

    if (annotate) {
        writeGenes(data = d,
               filename = file,
               mart = mart, 
               typeId = typeID(icaSets[[1]])["geneID_biomart"],
               typeRetrieved = NULL,
               sortBy = "median_rank",
               colAnnot = NULL,
               decreasing = FALSE
               )
    }
    
    ## attribute meanRank to genes in the union
    return(d)


}