R/pcxn_analyze.R

# FUNCTION
# - arguments: collection,2 genesets (1 per phenotype), top correlated genesets
# - returns: data.frame which contains the pairs between the phenotype_genesets
# and their top corralated genesets after filtering. Results are sorted in
# descending correlation order

# NOTES
# - Collection: only accepting pathprint,MSigDB_H,MSigDB_C2_CP,MSigDB_C5_GO_BP
# - Phenotype_genesets: must belong to the collection and be spelled exactly
# right!
# - Do we need a search feature that allows the user to search for genesets?
# - If either of the above does not hold, the function specifies which geneset/s
# do not belong to the collection and also states their phenotype
# - The initial data.frame is filtered by minimum absolute correlation and
# maximum p-value

utils::globalVariables(c("pathprint.Hs.gs","pheatmap",
                        "pathCor_pathprint_v1.2.3_dframe",
                        "pathCor_CPv5.1_dframe", "gobp_gs_v5.1",
                        "PathCor", "p.value", "Pathway.A", 
                        "Pathway.B"))

#' Discover correlation relationships among multiple pathways/gene sets
#' identified by GSEA (gene set enrichment analysis). All the input
#' pathways/gene sets should come from the same collection. MSigDB H hallmark
#' gene sets, MSigDB C2 Canonical pathways, MSigDB C5 GO BP gene sets,
#' and Pathprint are treated as four separate collections.
#'
#' @param collection pathway's collection
#' @param phenotype_0_genesets genesets/pathways of the first group of pathways
#' @param phenotype_1_genesets genesets/pathways of the second group of pathways
#' @param adj_overlap whether the correlation coefficients are adjusted for gene
#' overlap
#' @param top most correlated genesets/pathways
#' @param min_abs_corr minimum absolute correlation
#' @param max_pval maximum p-value
#'
#' @return pcxn object
#' @export
#'
#' @examples
#' \dontrun{
#'  pcxn_analyze("pathprint",c("ABC transporters (KEGG)",
#'  "ACE Inhibitor Pathway (Wikipathways)","AR down reg. targets (Netpath)"),
#'   adj_overlap = TRUE, c("DNA Repair (Reactome)"), 10, 0.05, 0.05)
#' }

pcxn_analyze <-
    function(collection = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                            "MSigDB_C5_GO_BP"),
                phenotype_0_genesets = NULL,phenotype_1_genesets = NULL,
                adj_overlap = FALSE,
                top = 10,
                min_abs_corr = 0.05,
                max_pval = 0.05 ) {
        
        if(missing(phenotype_0_genesets) && missing(phenotype_1_genesets))
            stop( "Please insert phenotype_0_genesets and/or 
                    phenotype_1_genesets arguments")
        
        pathprint.Hs.gs.rda <- NULL
        pathprint.Hs.gs <- NULL
        h_gs_v5.1 <- NULL
        cp_gs_v5.1 <- NULL
        gobp_gs_v5.1.rda <- NULL
        
        pathCor_Hv5.1_dframe <- NULL
        pathCor_GOBPv5.1_dframe <- NULL
        pathCor_CPv5.1_dframe.rda  <- NULL
        pathCor_Hv5.1_unadjusted_dframe <- NULL
        pathCor_GOBPv5.1_unadjusted_dframe <- NULL
        pathCor_CPv5.1_unadjusted_dframe.rda  <- NULL
        
        # loading the datasets, when the package is complete  use data(...)
        correct_genesets_flag_1 <- TRUE
        correct_genesets_flag_2 <- TRUE
        
        acceptable_collections = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                                    "MSigDB_C5_GO_BP")
        
        if ( collection == "pathprint" ) {
            if ( adj_overlap ) {
                data(pathCor_pathprint_v1.2.3_dframe, envir = environment())
                matrix <- pathCor_pathprint_v1.2.3_dframe
            }
            else {
                data(pathCor_pathprint_v1.2.3_unadjusted_dframe, 
                    envir = environment())
                matrix <- pathCor_pathprint_v1.2.3_unadjusted_dframe 
            }
            
            data(pathprint.Hs.gs, envir = environment())
            names <- names(pathprint.Hs.gs)
        }
        if ( collection == "MSigDB_H" ) {
            if ( adj_overlap ) {
                data(pathCor_Hv5.1_dframe, envir = environment())
                matrix <- pathCor_Hv5.1_dframe
            }
            else {
                data(pathCor_Hv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_Hv5.1_unadjusted_dframe 
            }
                
            data(h_gs_v5.1, envir = environment())
            names <- names(h_gs_v5.1)
        }
        if ( collection == "MSigDB_C2_CP" ) {
            if ( adj_overlap ) {
                data(pathCor_CPv5.1_dframe, envir = environment())
                matrix <- pathCor_CPv5.1_dframe
            }
            else {
                data(pathCor_CPv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_CPv5.1_unadjusted_dframe 
            }
            data(cp_gs_v5.1, envir = environment())
            names <- names(cp_gs_v5.1)
        }
        if ( collection == "MSigDB_C5_GO_BP" ) {
            if ( adj_overlap ) {
                data(pathCor_GOBPv5.1_dframe, envir = environment())
                matrix <- pathCor_GOBPv5.1_dframe
            }
            else {
                data(pathCor_GOBPv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_GOBPv5.1_unadjusted_dframe 
            }
            data(gobp_gs_v5.1.rda, envir = environment())
            names <- names(gobp_gs_v5.1)
        }
        
        # Checking if phenotype_genesets are correctly spelled
        commons_0 <- intersect(phenotype_0_genesets, names)
        difs_0 <- c()
        ph_0 <- c()
        if ( length(commons_0) != length(phenotype_0_genesets)) {
            correct_genesets_flag_1 <- FALSE
            difs_0 <- setdiff(phenotype_0_genesets,commons_0)
            ph_0 <- rep(0, length(difs_0))
        }
        commons_1 <- intersect(phenotype_1_genesets, names)
        difs_1 <- c()
        ph_1 <- c()
        if ( length(commons_1) != length(phenotype_1_genesets)) {
            correct_genesets_flag_2 <- FALSE
            difs_1 <- setdiff(phenotype_1_genesets,commons_1)
            ph_1 <- rep(1, length(difs_1))
        }
        difs <- c(difs_0,difs_1)
        phs <- c(ph_0,ph_1)
        
        # stop if wrong collection inserted or either phenotype genesets do not
        # match the collection
        if(!(collection %in% acceptable_collections))
            stop( "Invalid collection name, please choose between: pathprint,
            MSigDB_H, MSigDB_C2_CP or MSigDB_C5_GO_BP")
        if((correct_genesets_flag_1 == FALSE) || 
            (correct_genesets_flag_2 == FALSE))
            stop( paste("Phenotype ",phs," geneset \"",difs,
                        "\" does not belong to the ",collection ,
                        " collection\n  ", sep = ""))
        
        # find un-used genesets and create the placeholder data.frame
        # for their correlation scores
        used_genesets <- c(phenotype_0_genesets,phenotype_1_genesets)
        unused_genesets <- setdiff(names,used_genesets)
        
        candidate_genesets <- data.frame(unused_genesets)
        candidate_genesets$score <- rep(0,nrow(candidate_genesets))
        
        # step 1: Filtering on absolute correlation (>=) and p-value (<=)
        step1_matrix <-
            subset(matrix, abs(PathCor) >= min_abs_corr & p.value <= max_pval)
        
        # generate score for each un-used geneset and find the top correlated
        for(i in 1:length(unused_genesets)) {
            temp_cor <- subset(step1_matrix,(Pathway.A == unused_genesets[i]
                        | Pathway.B == unused_genesets[i]))
            temp_cor2 <- subset(temp_cor, (Pathway.A %in% used_genesets |
                                                Pathway.B %in% used_genesets))
            correlation_vector <- temp_cor2[3]
            
            if (dim(correlation_vector)[1] != 0) {
                # transpose vector
                correlation_vector <- unname(t(correlation_vector))
                
                max_position <- which.max(abs(as.numeric(
                    unlist(correlation_vector))))
                max <- correlation_vector[max_position]
                candidate_genesets[i,2] <- max
            }
        }
        
        candidate_genesets<- candidate_genesets[
            with(candidate_genesets, order(-abs(candidate_genesets$score))),]
        top_correlated_genesets <- candidate_genesets[,"unused_genesets"][1:top]
        
        # create matrix with geneset groups
        info <- list(phenotype_0_genesets, phenotype_1_genesets,
                    as.vector(top_correlated_genesets))
        names(info) <- c("phenotype_0_genesets", "phenotype_1_genesets",
                            "top_correlated_genesets")
        
        # step 2: Find the possible pairs between phenotype_0_genesets,
        # phenotype_1_genesets and correlated genesets.
        interesting_genesets <- c(as.list(used_genesets),
                                    as.vector(top_correlated_genesets))
        
        step2_matrix <-
            subset(step1_matrix, Pathway.A %in% interesting_genesets &
                        Pathway.B %in% interesting_genesets)
        
        if(length(phenotype_1_genesets) == 0)
            print(paste("Successful analyzing: Based on phenotype 0 [",
                        paste(phenotype_0_genesets, collapse=', ' ),"] and ",
                        top, " top correlated genesets, ", dim(step2_matrix)[1],
                        " correlation pairs were found.", sep =""))
        else
            print(paste("Successful analyzing: Based on phenotype 0 [",
                        paste(phenotype_0_genesets, collapse=', ' ),
                        "], phenotype 1 [",
                        paste(phenotype_1_genesets, collapse=', ' ),"] and ",
                        top, " top correlated genesets, ", dim(step2_matrix)[1],
                        " correlation pairs were found.", sep =""))
        
        po = new("pcxn",type = "pcxn_analyze", data = as.matrix(step2_matrix),
                geneset_groups = as.list(info))
        
        return(po)
    }
hidelab/pcxn documentation built on May 21, 2019, 3:03 a.m.