gt23: Gene therapy integration analysis tool suite.

Documented in scanStats tuneScanStatistics

#' Run geneRxCluster scan statistics.
#' Bioinformatics. 2014 Jun 1;30(11):1493-500. 
#' 
#' @param  kvals: The interpretation of kvals = 2L:5L is that genomic windows will be drawn for every consecutive groups of integration sites with group sizes of 2, 3, 4, and 5 integration sites. 
#' @return Data frame of identified clusters with counts and statistics where target.min is the smallest nominal False Discoveries Expected for each cluster.  
#'
#' @export
scanStats <- function(gr1, gr2, gr1.label = 'A', gr2.label = 'B', kvals = '15L:30L', nperm = 100,
                      cutpt.tail.expr = 'critVal.target(k, n, target = 5, posdiff = x)',
                      cutpt.filter.expr = 'as.double(apply(x, 2, median, na.rm = TRUE))'
                      #cutpt.filter.expr = paste0('apply(x, 2, quantile, probs = ', p, ', na.rm = TRUE)')
                      ){
  library(GenomicRanges)
  library(geneRxCluster)
  
  gr1$clusterSource <- TRUE
  gr2$clusterSource <- FALSE
  
  gr3 <- c(gr1, gr2)
  
  df <- GenomicRanges::as.data.frame(gr3)
  df <- df[order(df$seqnames, df$start),]
  df$seqnames <- droplevels(df$seqnames)
  row.names(df) <- NULL
  df <- df[, c('seqnames', 'start', 'clusterSource')]
  
  comm <- paste0('gRxCluster(df$seqnames, df$start, df$clusterSource, ',  kvals, ', nperm=', nperm, ', cutpt.tail.expr=', cutpt.tail.expr, ', cutpt.filter.expr=', cutpt.filter.expr, ')')
  scan <- eval(parse(text = comm))
  
  if(length(scan)==0) return(GRanges())
  
  gr1.overlap <- GenomicRanges::findOverlaps(gr1, scan, ignore.strand=TRUE)
  gr2.overlap <- GenomicRanges::findOverlaps(gr2, scan, ignore.strand=TRUE)
  
  if(length(gr1.overlap) > 0 & length(gr2.overlap) == 0){
    scan$clusterSource <- gr1.label
    return(scan)
  }
  
  if(length(gr1.overlap) == 0 & length(gr2.overlap) > 0){
    scan$clusterSource <- gr2.label
    return(scan)
  }
  
  gr1.overlap <- stats::aggregate(queryHits ~ subjectHits, data=as.data.frame(gr1.overlap), FUN=length)
  gr2.overlap <- stats::aggregate(queryHits ~ subjectHits, data=as.data.frame(gr2.overlap), FUN=length)
  
  gr1.list <- list()
  gr1.list[gr1.overlap[,1]] <- gr1.overlap[,2]
  
  gr2.list <- list()
  gr2.list[gr2.overlap[,1]] <- gr2.overlap[,2]
  
  # index error protection
  gr1.list[seq(length(gr1.list)+1, (length(gr1.list) + length(scan)-length(gr1.list) + 1))] <- 0
  gr2.list[seq(length(gr2.list)+1, (length(gr2.list) + length(scan)-length(gr2.list) + 1))] <- 0
  
  scan$clusterSource <- '?'
  
  for(i in 1:length(scan))
  {
    if(! is.null(gr1.list[[i]]) & is.null(gr2.list[[i]])){
      scan[i]$clusterSource <- gr1.label
    } else if (is.null(gr1.list[[i]]) & ! is.null(gr2.list[[i]])){
      scan[i]$clusterSource <- gr2.label
    } else if (gr1.list[[i]] > gr2.list[[i]]){
      scan[i]$clusterSource <- gr1.label
    } else {
      scan[i]$clusterSource <- gr2.label
    }
  }
  
  scan
}


#' Run a number of combination of settings for geneRxCluster scan statistics.
#' Bioinformatics. 2014 Jun 1;30(11):1493-500. 
#' 
#' @return Data frame of clusters statistics from each combination of paramteres tested. 
#'
#' @export
tuneScanStatistics <- function(A, B, CPUs=25, 
                               minWindow=5, maxWindow=100, windowStep=5, 
                               minProb=0.75, maxProb=0.95, probStep=0.025, 
                               minTarget=5, maxTarget=100, targetStep=5){
  
  t <- do.call(rbind, lapply(minWindow:(maxWindow - minWindow), function(i){
    r <- data.frame()
    
    for(i2 in (minWindow+windowStep):maxWindow){
      if(i2-i>=windowStep){
        for(p in seq(minProb, maxProb, by=probStep)){
          for(t in seq(minTarget, maxTarget, by=targetStep)){
            kvals <- paste0(i, 'L:', i2, 'L')
            cutpt.filter.expr <- paste0('apply(x, 2, quantile, probs = ', p, ', na.rm = TRUE)')
            cutpt.tail.expr   <- paste0('critVal.target(k, n, target = ', t, ', posdiff = x)')
            
            r <- rbind(r, data.frame(kvals=kvals,
                                     cutpt.filter.expr=cutpt.filter.expr,
                                     cutpt.tail.expr=cutpt.tail.expr))
          }
        }
      }
    }
    
    r
  }))
  
  t$s <- ceiling(seq_along(1:nrow(t))/(nrow(t)/CPUs))
  save(list = ls(all.names = TRUE), file='tuneData.RData', envir = environment())

  cluster <- parallel::makeCluster(CPUs)
  t2 <- do.call(rbind, parallel::parLapply(cluster, split(t, t$s), function(x){
  #t2 <- do.call(rbind, lapply(split(t, t$s), function(x){  
    load('tuneData.RData')
    
    do.call(rbind, lapply(1:nrow(x), function(i){
      message('chunk ', x$s[1], ' parameter row ', i)
      
      tryCatch({
                  scan <- gt23::scanStats(A, B,
                                          kvals=x[i,]$kvals, 
                                          nperm='1000',
                                          cutpt.filter.expr=x[i,]$cutpt.filter.expr,
                                          cutpt.tail.expr=x[i,]$cutpt.tail.expr)
                         
                  data.frame(kvals=x[i,]$kvals,
                             cutpt.filter.expr=x[i,]$cutpt.filter.expr,
                             cutpt.tail.expr=x[i,]$cutpt.tail.expr,
                             FDR=gRxSummary(scan)$FDR,
                             totalClusters=gRxSummary(scan)$Clusters_Discovered,
                             Aclusters=length(subset(scan, clusterSource=='A')),
                             Bclusters=length(subset(scan, clusterSource=='B')),
                             avgClusterWidth=(sum(width(scan)) / length(scan)))
               }, 
               error = function(cond) {
                         return(data.frame())
               })
    }))
  }))
  stopCluster(cluster)
  t2
}