R/pcxn_analyze.R
In pcxn: Exploring, analyzing and visualizing functions utilizing the pcxnData package

Documented in pcxn_analyze

# FUNCTION
# - arguments: collection,2 genesets (1 per phenotype), top correlated genesets
# - returns: data.frame which contains the pairs between the phenotype_genesets
# and their top corralated genesets after filtering. Results are sorted in
# descending correlation order

# NOTES
# - Collection: only accepting pathprint,MSigDB_H,MSigDB_C2_CP,MSigDB_C5_GO_BP
# - Phenotype_genesets: must belong to the collection and be spelled exactly
# right!
# - Do we need a search feature that allows the user to search for genesets?
# - If either of the above does not hold, the function specifies which geneset/s
# do not belong to the collection and also states their phenotype
# - The initial data.frame is filtered by minimum absolute correlation and
# maximum p-value

utils::globalVariables(c("pathprint.Hs.gs","pheatmap",
                        "pathCor_pathprint_v1.2.3_dframe",
                        "pathCor_CPv5.1_dframe", "gobp_gs_v5.1",
                        "PathCor", "p.value", "Pathway.A", 
                        "Pathway.B"))

#' Discover correlation relationships among multiple pathways/gene sets
#' identified by GSEA (gene set enrichment analysis). All the input
#' pathways/gene sets should come from the same collection. MSigDB H hallmark
#' gene sets, MSigDB C2 Canonical pathways, MSigDB C5 GO BP gene sets,
#' and Pathprint are treated as four separate collections.
#'
#' @param collection pathway's collection
#' @param phenotype_0_genesets genesets/pathways of the first group of pathways
#' @param phenotype_1_genesets genesets/pathways of the second group of pathways
#' @param adj_overlap whether the correlation coefficients are adjusted for gene
#' overlap
#' @param top most correlated genesets/pathways
#' @param min_abs_corr minimum absolute correlation
#' @param max_pval maximum p-value
#'
#' @return pcxn object
#' @export
#'
#' @examples
#' \dontrun{
#'  pcxn_analyze("pathprint",c("ABC transporters (KEGG)",
#'  "ACE Inhibitor Pathway (Wikipathways)","AR down reg. targets (Netpath)"),
#'   adj_overlap = TRUE, c("DNA Repair (Reactome)"), 10, 0.05, 0.05)
#' }

pcxn_analyze <-
    function(collection = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                            "MSigDB_C5_GO_BP"),
                phenotype_0_genesets = NULL,phenotype_1_genesets = NULL,
                adj_overlap = FALSE,
                top = 10,
                min_abs_corr = 0.05,
                max_pval = 0.05 ) {
        
        if(missing(phenotype_0_genesets) && missing(phenotype_1_genesets))
            stop( "Please insert phenotype_0_genesets and/or 
                    phenotype_1_genesets arguments")
        
        pathprint.Hs.gs.rda <- NULL
        pathprint.Hs.gs <- NULL
        h_gs_v5.1 <- NULL
        cp_gs_v5.1 <- NULL
        gobp_gs_v5.1.rda <- NULL
        
        pathCor_Hv5.1_dframe <- NULL
        pathCor_GOBPv5.1_dframe <- NULL
        pathCor_CPv5.1_dframe.rda  <- NULL
        pathCor_Hv5.1_unadjusted_dframe <- NULL
        pathCor_GOBPv5.1_unadjusted_dframe <- NULL
        pathCor_CPv5.1_unadjusted_dframe.rda  <- NULL
        
        # loading the datasets, when the package is complete  use data(...)
        correct_genesets_flag_1 <- TRUE
        correct_genesets_flag_2 <- TRUE
        
        acceptable_collections = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                                    "MSigDB_C5_GO_BP")
        
        if ( collection == "pathprint" ) {
            if ( adj_overlap ) {
                data(pathCor_pathprint_v1.2.3_dframe, envir = environment())
                matrix <- pathCor_pathprint_v1.2.3_dframe
            }
            else {
                data(pathCor_pathprint_v1.2.3_unadjusted_dframe, 
                    envir = environment())
                matrix <- pathCor_pathprint_v1.2.3_unadjusted_dframe 
            }
            
            data(pathprint.Hs.gs, envir = environment())
            names <- names(pathprint.Hs.gs)
        }
        if ( collection == "MSigDB_H" ) {
            if ( adj_overlap ) {
                data(pathCor_Hv5.1_dframe, envir = environment())
                matrix <- pathCor_Hv5.1_dframe
            }
            else {
                data(pathCor_Hv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_Hv5.1_unadjusted_dframe 
            }
                
            data(h_gs_v5.1, envir = environment())
            names <- names(h_gs_v5.1)
        }
        if ( collection == "MSigDB_C2_CP" ) {
            if ( adj_overlap ) {
                data(pathCor_CPv5.1_dframe, envir = environment())
                matrix <- pathCor_CPv5.1_dframe
            }
            else {
                data(pathCor_CPv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_CPv5.1_unadjusted_dframe 
            }
            data(cp_gs_v5.1, envir = environment())
            names <- names(cp_gs_v5.1)
        }
        if ( collection == "MSigDB_C5_GO_BP" ) {
            if ( adj_overlap ) {
                data(pathCor_GOBPv5.1_dframe, envir = environment())
                matrix <- pathCor_GOBPv5.1_dframe
            }
            else {
                data(pathCor_GOBPv5.1_unadjusted_dframe, envir = environment())
                matrix <- pathCor_GOBPv5.1_unadjusted_dframe 
            }
            data(gobp_gs_v5.1.rda, envir = environment())
            names <- names(gobp_gs_v5.1)
        }
        
        # Checking if phenotype_genesets are correctly spelled
        commons_0 <- intersect(phenotype_0_genesets, names)
        difs_0 <- c()
        ph_0 <- c()
        if ( length(commons_0) != length(phenotype_0_genesets)) {
            correct_genesets_flag_1 <- FALSE
            difs_0 <- setdiff(phenotype_0_genesets,commons_0)
            ph_0 <- rep(0, length(difs_0))
        }
        commons_1 <- intersect(phenotype_1_genesets, names)
        difs_1 <- c()
        ph_1 <- c()
        if ( length(commons_1) != length(phenotype_1_genesets)) {
            correct_genesets_flag_2 <- FALSE
            difs_1 <- setdiff(phenotype_1_genesets,commons_1)
            ph_1 <- rep(1, length(difs_1))
        }
        difs <- c(difs_0,difs_1)
        phs <- c(ph_0,ph_1)
        
        # stop if wrong collection inserted or either phenotype genesets do not
        # match the collection
        if(!(collection %in% acceptable_collections))
            stop( "Invalid collection name, please choose between: pathprint,
            MSigDB_H, MSigDB_C2_CP or MSigDB_C5_GO_BP")
        if((correct_genesets_flag_1 == FALSE) || 
            (correct_genesets_flag_2 == FALSE))
            stop( paste("Phenotype ",phs," geneset \"",difs,
                        "\" does not belong to the ",collection ,
                        " collection\n  ", sep = ""))
        
        # find un-used genesets and create the placeholder data.frame
        # for their correlation scores
        used_genesets <- c(phenotype_0_genesets,phenotype_1_genesets)
        unused_genesets <- setdiff(names,used_genesets)
        
        candidate_genesets <- data.frame(unused_genesets)
        candidate_genesets$score <- rep(0,nrow(candidate_genesets))
        
        # step 1: Filtering on absolute correlation (>=) and p-value (<=)
        step1_matrix <-
            subset(matrix, abs(PathCor) >= min_abs_corr & p.value <= max_pval)
        
        # generate score for each un-used geneset and find the top correlated
        for(i in 1:length(unused_genesets)) {
            temp_cor <- subset(step1_matrix,(Pathway.A == unused_genesets[i]
                        | Pathway.B == unused_genesets[i]))
            temp_cor2 <- subset(temp_cor, (Pathway.A %in% used_genesets |
                                                Pathway.B %in% used_genesets))
            correlation_vector <- temp_cor2[3]
            
            if (dim(correlation_vector)[1] != 0) {
                # transpose vector
                correlation_vector <- unname(t(correlation_vector))
                
                max_position <- which.max(abs(as.numeric(
                    unlist(correlation_vector))))
                max <- correlation_vector[max_position]
                candidate_genesets[i,2] <- max
            }
        }
        
        candidate_genesets<- candidate_genesets[
            with(candidate_genesets, order(-abs(candidate_genesets$score))),]
        top_correlated_genesets <- candidate_genesets[,"unused_genesets"][1:top]
        
        # create matrix with geneset groups
        info <- list(phenotype_0_genesets, phenotype_1_genesets,
                    as.vector(top_correlated_genesets))
        names(info) <- c("phenotype_0_genesets", "phenotype_1_genesets",
                            "top_correlated_genesets")
        
        # step 2: Find the possible pairs between phenotype_0_genesets,
        # phenotype_1_genesets and correlated genesets.
        interesting_genesets <- c(as.list(used_genesets),
                                    as.vector(top_correlated_genesets))
        
        step2_matrix <-
            subset(step1_matrix, Pathway.A %in% interesting_genesets &
                        Pathway.B %in% interesting_genesets)
        
        if(length(phenotype_1_genesets) == 0)
            print(paste("Successful analyzing: Based on phenotype 0 [",
                        paste(phenotype_0_genesets, collapse=', ' ),"] and ",
                        top, " top correlated genesets, ", dim(step2_matrix)[1],
                        " correlation pairs were found.", sep =""))
        else
            print(paste("Successful analyzing: Based on phenotype 0 [",
                        paste(phenotype_0_genesets, collapse=', ' ),
                        "], phenotype 1 [",
                        paste(phenotype_1_genesets, collapse=', ' ),"] and ",
                        top, " top correlated genesets, ", dim(step2_matrix)[1],
                        " correlation pairs were found.", sep =""))
        
        po = new("pcxn",type = "pcxn_analyze", data = as.matrix(step2_matrix),
                geneset_groups = as.list(info))
        
        return(po)
    }

Any scripts or data that you put into this service are public.

pcxn documentation built on Nov. 8, 2020, 10:58 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

pcxn
Exploring, analyzing and visualizing functions utilizing the pcxnData package

R/pcxn_analyze.R
In pcxn: Exploring, analyzing and visualizing functions utilizing the pcxnData package

Defines functions pcxn_analyze

Documented in pcxn_analyze

Try the pcxn package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

pcxn Exploring, analyzing and visualizing functions utilizing the pcxnData package

R/pcxn_analyze.R In pcxn: Exploring, analyzing and visualizing functions utilizing the pcxnData package

Defines functions pcxn_analyze

Documented in pcxn_analyze

Try the pcxn package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

pcxn
Exploring, analyzing and visualizing functions utilizing the pcxnData package

R/pcxn_analyze.R
In pcxn: Exploring, analyzing and visualizing functions utilizing the pcxnData package