R/pcxn_explore.R

# FUNCTION
# - arguments: collection, query_geneset, top correlated query_genesets
# - returns: data.frame which contains the pairs between the query_geneset
# and it's top corralated genesets after filtering. Results are sorted
# in descending correlation order

# NOTES
# - Collection: only accepting pathprint,MSigDB_H,MSigDB_C2_CP,MSigDB_C5_GO_BP
# - query_geneset: must belong to the collection and be spelled exactly right!
# (do we need a search feature that allows the user to search
# for query_genesets?)
# - The initial data.frame is filtered by minimum absolute correlation
# and maximum p-value

utils::globalVariables(c("pathprint.Hs.gs", "pathCor_Hv5.1_dframe", 
                        "pathCor_Hv5.1_unadjusted_dframe",
                        "h_gs_v5.1","cp_gs_v5.1", "cp_gs_v5.1",
                        "pathCor_GOBPv5.1_dframe",
                        "pathCor_GOBPv5.1_unadjusted_dframe"))

#' Select a single pathway/gene set from one of the four collections ( MSigDB H
#' hallmark gene sets, MSigDB C2 Canonical pathways, MSigDB C5 GO BP gene sets,
#' and Pathprint ) and discover its correlated pathway/gene sets within the
#' same collection.
#'
#' @param collection pathways' collection
#' @param query_geneset the single pathway of interest
#' @param adj_overlap whether the correlation coefficients are adjusted for gene
#' overlap
#' @param top most correlated genesets/pathways
#' @param min_abs_corr minimum absolute correlation
#' @param max_pval maximum p-value
#'
#' @return pcxn object
#' @export
#'
#' @examples
#' \dontrun{
#'  pcxn<- pcxn_explore("pathprint","Alzheimer's disease (KEGG)",
#'  adj_overlap = TRUE, 10, 0.05, 0.05)
#' }

pcxn_explore <- function(collection = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                                        "MSigDB_C5_GO_BP"),
                        query_geneset,
                        adj_overlap = FALSE,
                        top = 10,
                        min_abs_corr = 0.05,
                        max_pval = 0.05) {
    
    if(missing(query_geneset))
        stop( "Please insert query_geneset argument")
    
    # loading the datasets
    correct_query_geneset_flag <- FALSE
    acceptable_collections = c("pathprint","MSigDB_H","MSigDB_C2_CP",
                                "MSigDB_C5_GO_BP")
    if ( collection == "pathprint" ) {
        if ( adj_overlap ) {
            data(pathCor_pathprint_v1.2.3_dframe, envir = environment())
            matrix <- pathCor_pathprint_v1.2.3_dframe 
        }
        else {
            data(pathCor_pathprint_v1.2.3_unadjusted_dframe, 
                envir = environment())
            matrix <- pathCor_pathprint_v1.2.3_unadjusted_dframe 
        }

        data(pathprint.Hs.gs, envir = environment())
        if (  query_geneset %in% names(pathprint.Hs.gs) )
            correct_query_geneset_flag <- TRUE
    }
    if ( collection == "MSigDB_H" ) {
        if ( adj_overlap ) {
            data(pathCor_Hv5.1_dframe, envir = environment())
            matrix <- pathCor_Hv5.1_dframe
        }
        else {
            data(pathCor_Hv5.1_unadjusted_dframe, envir = environment())
            matrix <- pathCor_Hv5.1_unadjusted_dframe 
        }
        
        tryCatch({
            data(h_gs_v5.1.rda, envir = environment())
        }, warning = function(w) {
        }, error = function(e) {
        }, finally = {
            data('h_gs_v5.1', envir = environment())
        })
        
        if ( query_geneset %in% names(h_gs_v5.1) )
            correct_query_geneset_flag <- TRUE
    }
    if ( collection == "MSigDB_C2_CP" ) {
        if ( adj_overlap ) {
            data(pathCor_CPv5.1_dframe, envir = environment())
            matrix <- pathCor_CPv5.1_dframe
        }
        else {
            data(pathCor_CPv5.1_unadjusted_dframe, envir = environment())
            matrix <- pathCor_CPv5.1_unadjusted_dframe 
        }
        
        tryCatch({
            data(cp_gs_v5.1.rda, envir = environment())
        }, warning = function(w) {
        }, error = function(e) {
        }, finally = {
            data('cp_gs_v5.1', envir = environment())
        })
        
        if (  query_geneset %in% names(cp_gs_v5.1) )
            correct_query_geneset_flag <- TRUE
    }
    if ( collection == "MSigDB_C5_GO_BP" ) {
        if ( adj_overlap ) {
            data(pathCor_GOBPv5.1_dframe, envir = environment())
            matrix <- pathCor_GOBPv5.1_dframe
        }
        else {
            data(pathCor_GOBPv5.1_unadjusted_dframe, envir = environment())
            matrix <- pathCor_GOBPv5.1_unadjusted_dframe 
        }
        
        tryCatch({
            data(gobp_gs_v5.1.rda, envir = environment())
        }, warning = function(w) {
        }, error = function(e) {
        }, finally = {
            data('gobp_gs_v5.1', envir = environment())
        })
        
        if ( query_geneset %in% names(gobp_gs_v5.1) )
            correct_query_geneset_flag <- TRUE
    }
    
    # stop if wrong collection inserted or query_geneset
    # does not match the collection
    if(!(collection %in% acceptable_collections))
        stop( "Invalid collection name, please choose between: pathprint,
            MSigDB_H, MSigDB_C2_CP or MSigDB_C5_GO_BP")
    if(!(correct_query_geneset_flag))
        stop( "query_geneset does not belong to the collection")
    
    # step 1: Find occurences of the query_geneset in 1st or 2nd column.
    # Basically find every pathway connected to the query pathway and
    # sort them by correlation
    m1 <- subset(matrix,Pathway.A == query_geneset)
    m2 <- subset(matrix,Pathway.B == query_geneset)
    step1_matrix <- as.data.frame(rbind(m1,m2))
    top_step1_matrix<- 
        (step1_matrix[order(-abs(step1_matrix$PathCor)),])[1:top,]
    
    # Most correlated genesets + query
    interesting_genesets <- unique(as.list(c(top_step1_matrix$Pathway.A,
                                            top_step1_matrix$Pathway.B)))
    
    # create matrix with geneset groups
    info <- list(query_geneset)
    names(info) <- "query_geneset"
    
    # Step 2: Extracting only the pairwise combinations of our query geneset
    # and the top correlated
    step2_matrix <- subset(matrix, Pathway.A %in% interesting_genesets &
                                Pathway.B %in% interesting_genesets)
    
    # step 3: Filtering on absolute correlation (>=) and p-value (<=)
    step3_matrix <- subset(step2_matrix, abs(PathCor) >= min_abs_corr &
                                p.value <= max_pval)
    
    print(paste("Successful exploring: Based on ", query_geneset ," and ",top,
                " top correlated genesets, ", dim(step3_matrix)[1],
                " correlation pairs were found.", sep =""))
    
    po = new("pcxn",type = "pcxn_explore", data = as.matrix(step3_matrix),
            geneset_groups = as.list(info))
    
    return(po)
}
hidelab/pcxn documentation built on May 21, 2019, 3:03 a.m.