R/makePSN_RangeSets.R

Defines functions makePSN_RangeSets

Documented in makePSN_RangeSets

#' Create patient similarity interaction networks based on range sets
#'
#' @details Creates patient similarity networks when data consist of 
#' genomic events associated with patients. Examples include CNV or 
#' indel data for patients. To generate networks from full matrices such
#' gene expression data, use \code{makePSN_NamedMatrix} instead.
#' Genomic ranges corresponding to events in patients (gr) should be named.
#' One network is created per named range set (rangeSet). Each set
#' reflects a group of related loci ; for example, genomic ranges associated
#' with genes in the same cellular pathway. 
#' Currently, the only similarity measure supported is binary; two patients
#' are related in a network N if they both overlap elements of set N.
#' @param gr (GRanges) patient ranges. Metadata should contain:
#' ID: (char) unique patient ID
#' LOCUS_NAME: (comma-separated char) named ranges overlapped
#' @param rangeSet (list) list of GRanges, one entry per range set.
#' Key is the name of the range set, and value is a GRanges object with
#' corresponding ranges
#' @param netDir (char) path to directory where networks should be written
#' @param simMetric (char) Similarity metric. Currently only 'coincide' 
#' is supported; two patients share an edge if they overlap elements in the
#' the same gene set. E.g. Two patients with CNVs that overlap different
#' genes of the same pathway would be related, but patients overlapping
#' genes that don't share a pathway (or, more accurately, a named-set 
#' grouping) would not be related. The edge weight is therefore binary.
#' @param verbose (logical) print detailed messages
#' @param numCores (integer) num cores for parallel processing
#' @param quorum (integer) minimum number of patients in a network for the 
#' network to be constructed
#' @return Vector of network filenames
#' @examples
#' data(pathway_GR,cnv_GR)
#' ### # example commented out to avoid build errors because of parallel
#' ### # execution. Uncomment to run.
#' ### netList <- makePSN_RangeSets(cnv_GR,pathway_GR,'.')
#' @export
#' @import GenomicRanges
#' @importFrom utils write.table
#' @import bigmemory
#' @import foreach
#' @import parallel
#' @import doParallel
#' @importFrom combinat combn
makePSN_RangeSets <- function(gr, rangeSet, netDir = tempdir(), 
		simMetric = "coincide", 
    quorum = 2L, verbose = TRUE, numCores = 1L) {
    if (!file.exists(netDir)) 
        dir.create(netDir)
    
    TEST_MODE <-FALSE # for debugging
	if (TEST_MODE) verbose <- TRUE
    
    # num patients per network
    netCountFile <- paste(netDir,"patient_count.txt",sep=getFileSep())
    # IDs of patients with 1+ interaction in set of networks
    incPatientFile <- paste(netDir,"inc_patients.txt",sep=getFileSep()) 
    
    uq_loci <- unique(unlist(lapply(rangeSet, function(x) {
        x$name
    })))
    uq_patients <- unique(gr$ID)
    
    if (!simMetric %in% "coincide") 
        stop("Only value supported for simMetric is 'coincide'")
    
    # LOCUS_NAMES not provided? Compute these
    if (!"LOCUS_NAMES" %in% names(elementMetadata(gr))) {
        message(paste("\tLOCUS_NAMES column not provided; computing ", 
	"overlap of patients\t\twith regions", 
            sep = ""))
        gr <- getRegionOL(gr, rangeSet)
    }
    
    # set up a matrix of patient by locus.  a[i,j] = 1 if patient i has a CNV
    # affecting locus j else 0
    message("* Preparing patient-locus matrix\n")
    message(sprintf("\t%i unique patients, %i unique locus symbols\n", 
				length(uq_patients), 
        length(uq_loci)))

	bkFile <- paste(tempdir(),"pgmat.bk",sep=getFileSep())
	descFile <- paste(tempdir(),"pgmat.desc",sep=getFileSep())
	if (file.exists(bkFile)) unlink(bkFile)
	if (file.exists(descFile)) unlink(descFile)

	    pgMat <- big.matrix(nrow = length(uq_patients), 
				ncol = length(uq_loci), type = "integer",
				backingpath=tempdir(),
				backingfile="pgmat.bk",
				descriptorfile="pgmat.desc")
	    pgDesc <- describe(pgMat)
    
    	cl <- makeCluster(numCores, outfile = "")
    	registerDoParallel(cl)
    
    num <- length(uq_patients)
    ckSize <- 50
    t0 <- Sys.time()
    x <- foreach(spos = seq(1, num, ckSize)) %dopar% {
        inner_mat <- bigmemory::attach.big.matrix(pgDesc)
        epos <- spos + (ckSize - 1)
        for (k in spos:epos) {
            idx <- which(gr$ID %in% uq_patients[k])
            myloci <- unlist(strsplit(gr$LOCUS_NAMES[idx], ","))
            myloci <- setdiff(myloci, "")
            
            if (length(myloci) > 0) {
                inner_mat[k, which(uq_loci %in% myloci)] <- 1L
            }
            if (k%%100 == 0) 
                message(".")
        }
    }
    t1 <- Sys.time()
    print(t1 - t0)
    
    hit_p <- integer(length(rangeSet))
    
    # binary vector indicating if this network was included inc_status <-
    # integer(length(rangeSet)) patients that have at least one 
		# interaction in one network
    inc_patients <- integer(length(uq_patients))
    names(inc_patients) <- uq_patients
    
    # now group set-by-set
    message("* Writing networks\n")
	`%myinfix%` <- ifelse(TEST_MODE, `%do%`, `%dopar%`)
    t0 <- Sys.time()
    outFiles <- foreach(idx = seq_len(length(rangeSet))) %myinfix% {
        curP <- names(rangeSet)[idx]
        if (verbose) 
            message(sprintf("\t%s: ", curP))
        
        inner_mat <- bigmemory::attach.big.matrix(pgDesc)
        
        locus_idx <- which(uq_loci %in% rangeSet[[idx]]$name)
        if (length(locus_idx) >= 2) {
            hit_pathway <- rowSums(inner_mat[, locus_idx])
        } else {
            # one-locus pathway, probably risk locus
            hit_pathway <- inner_mat[, locus_idx]
        }
        
        hit_p[idx] <- sum(hit_pathway > 0)
        if (verbose) 
            message(sprintf("%i patients with interactions", hit_p[idx]))
        
        pScore <- 1  # similarity score for default binary option
        outFile <- ""
        # pathway included in analysis
        if (hit_p[idx] >= quorum) {
                if (verbose) 
                  message(sprintf("\n\t\tlength=%i; score = %1.2f", 
		length(rangeSet[[idx]]), pScore))
                  x <- hit_pathway
                  inc_patients[x > 0] <- inc_patients[x > 0] + 1
		tmp <- uq_patients[hit_pathway > 0]
                  pat_pairs <- t(combinat::combn(tmp, 2))
                  pat_pairs <- cbind(pat_pairs, pScore)
                  
                  # write network for pathway
                  outFile <- paste(netDir,sprintf("%s_cont.txt",curP),
			sep=getFileSep())
                  write.table(pat_pairs, file = outFile, sep = "\t", 
			col.names = FALSE, 
                	row.names = FALSE, quote = FALSE)
                  outFile <- basename(outFile)
                ## status <- 1;
            }  
        if (idx%%100 == 0) 
            message(".")
        if (verbose) 
            message("\n")
        ## inc_status[idx] <- status
        
        if (!verbose) {
            if (idx%%100 == 0) 
                message(".")
            if (idx%%1000 == 0) 
                message("\n")
        }
        
        outFile
    }
    t1 <- Sys.time()
    print(t1 - t0)
    stopCluster(cl)
	if (file.exists(bkFile)) unlink(bkFile)
	if (file.exists(descFile)) unlink(descFile)
    outFiles <- unlist(outFiles)
    outFiles <- outFiles[which(outFiles != "")]
    
    outFiles
   } 
BaderLab/netDx documentation built on Sept. 26, 2021, 9:13 a.m.