R/xx_deprecated.R

Defines functions makePeakList readCounts calculateCounts calculateBHSUM calculateRPKMs subsampleRPKMs calculateBinomTests plotRPKMs plotBinarized plotBinomTests calculateHyperTests plotHyperTests

### These are historical functions
# Still here for reference, but not exported.

makePeakList <- function(peakDir,peakNames) {

  peakFiles <- list.files(peakDir)

  allPeaks <- list()

  for(i in 1:length(peakFiles)) {

    info <- file.info(paste0(peakDir,peakFiles[i]))
    no_lines <- info$size == 0

    if(no_lines) {
      bedPeaks <- data.frame(chr="chr",start=0,end=0,stringsAsFactors=F)
    } else {
      bedPeaks <- read.table(paste0(peakDir,peakFiles[i]),header=F)
    }
    GRPeaks <- GRanges(     seqnames = bedPeaks[,1],
                            IRanges(start = bedPeaks[,2],
                                    end = bedPeaks[,3]))
    allPeaks <- c(allPeaks,GRPeaks)
    names(allPeaks)[i] <- peakNames[i]

  }

  return(allPeaks)
}


readCounts <- function(bamFiles) {
  counts <- numeric()

  for(b in 1:length(bamFiles)) {

    bam <- import(bamFiles[b],format="bam")
    counts <- c(counts,length(bam))

  }
  return(counts)
}

calculateCounts <- function(peakList,bamFiles) {

  counts <- matrix(nrow=length(peakList),ncol=length(bamFiles))

  for(b in 1:length(bamFiles)) {
    print(paste0("Running ",b," of ",length(bamFiles)))

    bam <- import(bamFiles[b],format="bam")

    for(p in 1:length(peakList)) {
      counts[p,b] <- sum(countOverlaps(bam,peakList[[p]]))

    }
  }

  counts <- as.data.frame(counts)
  names(counts) <- bamFiles
  row.names(counts) <- names(peakList)

  return(counts)

}

calculateBHSUM <- function(peakSet,bamFiles,downsample=NULL,bigmode=F,verbose=F) {

  if(bigmode) {
    peakSet_size <- length(peakSet)
    chunk_size <- 100000
    n_chunks <- floor(peakSet_size / chunk_size)

    bhsum <- matrix(nrow=0,ncol=length(bamFiles))

    for(i in 1:n_chunks) {
      peakSet_chunk <- peakSet[((i - 1) * chunk_size + 1):(i * chunk_size),]

      chunk_bhsum <- matrix(nrow=chunk_size,ncol=length(bamFiles))

      for(b in 1:length(bamFiles)) {

        bam <- import(bamFiles[b],format="bam")

        if(!is.null(downsample)) {
          bam <- sample(bam,downsample)
        }

        bamCounts <- numeric()

        bamCounts <- as.numeric(c(bamCounts,countOverlaps(peakSet_chunk,bam)) > 0)

        chunk_bhsum[,b] <- bamCounts

      }

      chunk_bhsum <- chunk_bhsum[rowSums(chunk_bhsum > 0), ]

      bhsum <- rbind(bhsum,chunk_bhsum)
    }

    # Remaining sites after chunking
    peakSet_chunk <- peakSet[(n_chunks * chunk_size + 1):peakSet_size,]

    chunk_bhsum <- matrix(nrow=chunk_size,ncol=length(bamFiles))

    for(b in 1:length(bamFiles)) {

      bam <- import(bamFiles[b],format="bam")

      if(!is.null(downsample)) {
        ds_n <- downsample[1]
        ds_type <- downsample[2]

        if(ds_type == "reads") {

          bam <- sample(bam,ds_n)

        } else if(ds_type == "fragments") {

          bam_p <- bam[strand(bam) == "+",]
          bam_n <- bam[strand(bam) == "-",]
          bam_f <- GRanges(seqname = seqnames(bam_p),IRanges(start = start(bam_p),end = end(bam_p)))
          bam <- sample(bam,ds_n)

        }
      }

      bamCounts <- numeric()

      bamCounts <- as.numeric(c(bamCounts,countOverlaps(peakSet_chunk,bam)) > 0)

      chunk_bhsum[,b] <- bamCounts

    }

    chunk_bhsum <- chunk_bhsum[rowSums(chunk_bhsum > 0), ]

    bhsum <- rbind(bhsum,chunk_bhsum)

  }  else {

    bhsum <- matrix(nrow=length(peakSet),ncol=length(bamFiles))

    for(b in 1:length(bamFiles)) {

      if(verbose) {
        print(paste("Analyzing",bamFiles[b]))
      }

      bam <- import(bamFiles[b],format="bam")

      if(!is.null(downsample)) {
        bam <- sample(bam,downsample)
      }

      bamCounts <- numeric()

      bamCounts <- as.numeric(c(bamCounts,countOverlaps(peakSet,bam)) > 0)

      bhsum[,b] <- bamCounts

    }

  }
  names(bhsum) <- bamFiles

  return(bhsum)

}


calculateRPKMs <- function(peakList,bamFiles) {

  rpkm <- matrix(nrow=length(peakList),ncol=length(bamFiles))

  for(b in 1:length(bamFiles)) {

    bam <- import(bamFiles[b],format="bam")
    m <- length(bam)/1e6

    for(p in 1:length(peakList)) {

      k <- sum(width(peakList[[p]]))/1e3

      rpkm[p,b] <- sum(countOverlaps(bam,peakList[[p]]))/k/m

    }
  }

  rpkm <- as.data.frame(rpkm)
  names(rpkm) <- bamFiles
  row.names(rpkm) <- names(peakList)

  return(rpkm)
}


subsampleRPKMs <- function(peakList,bamFiles,subcount=100000) {

  rpkm <- matrix(nrow=length(peakList),ncol=length(bamFiles))

  for(b in 1:length(bamFiles)) {

    bam <- import(bamFiles[b],format="bam")

    bam <- sample(bam,subcount)

    m <- length(bam)/1e6

    for(p in 1:length(peakList)) {

      k <- sum(width(peakList[[p]]))/1e3

      rpkm[p,b] <- sum(countOverlaps(bam,peakList[[p]]))/k/m

    }
  }

  rpkm <- as.data.frame(rpkm)
  names(rpkm) <- bamFiles
  row.names(rpkm) <- names(peakList)

  return(rpkm)
}

calculateBinomTests <- function(peakList,bamFiles,bamFilter=".+",correction=T,min_count=0) {

  mm10Size <- 2652783500

  binomPValues <- matrix(nrow=length(peakList),ncol=length(bamFiles))

  for(b in 1:length(bamFiles)) {

    bam <- import(bamFiles[b],format="bam")

    if(length(bam) >= min_count) {

      for(i in 1:length(peakList)) {

        p <- sum(width(peakList[[i]]))/mm10Size
        n <- length(bam)
        x <- sum(countOverlaps(bam,peakList[[i]]))

        binomPValues[i,b] <- binom.test(x,n,p)$p.value

      }
    } else {
      binomPValues[,b] <- rep(1,length(peakList))
    }
  }

  if(correction) {
    binomPValues <- binomPValues * ( ncol(binomPValues) * nrow(binomPValues) )
    binomPValues[binomPValues > 1] <- 1
  }

  binomPValues <- as.data.frame(binomPValues)
  names(binomPValues) <- bamFiles
  row.names(binomPValues) <- names(peakList)

  return(binomPValues)

}


plotRPKMs <- function(rpkm,lowCut=0,highCut=5) {


  rpkmPlotData <- data.frame(val=unlist(data.frame(rpkm)),
                             x=rep(1:ncol(rpkm),each=nrow(rpkm)),
                             yname=rep(rownames(rpkm),ncol(rpkm)),
                             color="white")

  yorder <- data.frame(yname=rownames(rpkm),y=1:length(rownames(rpkm))) %>% arrange(y)

  rpkmPlotData <- rpkmPlotData %>% left_join(yorder)

  colorSet <- colorRampPalette(c("darkblue","orangered"))(1001)
  lowColor <- "white"
  highColor <- "red"

  rpkmPlotData$color[rpkmPlotData$val < lowCut] <- lowColor
  rpkmPlotData$color[rpkmPlotData$val > highCut] <- highColor
  intermediates <- rpkmPlotData$val >= lowCut & rpkmPlotData$val <= highCut
  rpkmPlotData$color[intermediates] <- colorSet[round((rpkmPlotData$val[intermediates]-lowCut)/(highCut-lowCut)*1000,0)+1]


  xlab <- names(rpkm)

  ylab <- yorder$yname

  rpkmPlot <- ggplot(rpkmPlotData) +
    geom_tile(aes(x=x,y=y,fill=color)) +
    scale_fill_identity() +
    scale_y_continuous(labels=ylab,breaks=1:nrow(rpkm),expand=c(0,0),name="peakSet") +
    scale_x_continuous(labels=xlab,breaks=1:ncol(rpkm),expand=c(0,0),name="bamFile") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.3))

  return(rpkmPlot)
}

plotBinarized <- function(rpkm) {

  binarized <- matrix(nrow=nrow(rpkm),ncol=ncol(rpkm))

  for(b in 1:ncol(rpkm)) {

    binarized[,b] <- as.numeric(rpkm[,b] == max(rpkm[,b]))

  }

  binarizedPlotData <- data.frame(val=unlist(data.frame(binarized)),
                                  x=rep(1:ncol(rpkm),each=nrow(rpkm)),
                                  yname=rep(rownames(rpkm),ncol(rpkm)))

  yorder <- data.frame(yname=rownames(rpkm),y=1:length(rownames(rpkm))) %>% arrange(y)
  binarizedPlotData <- binarizedPlotData %>% left_join(yorder)

  xlab <- names(rpkm)
  ylab <- yorder$yname

  binarizedPlot <- ggplot(binarizedPlotData) +
    geom_tile(aes(x=x,y=y,fill=val),color="black") +
    scale_fill_gradient2(low="white",mid="white",high="orangered",midpoint=max(binarizedPlotData$val)/2) +
    scale_y_continuous(labels=ylab,breaks=1:nrow(rpkm),expand=c(0,0),name="peakSet") +
    scale_x_continuous(labels=xlab,breaks=1:ncol(rpkm),expand=c(0,0),name="bamFile") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.3))

  return(binarizedPlot)
}

plotBinomTests <- function(rpkm,counts=F,lowCut=3,highCut=50) {

  rpkmPlotData <- data.frame(val=-log10(unlist(data.frame(rpkm))),
                             x=rep(1:ncol(rpkm),each=nrow(rpkm)),
                             yname=rep(rownames(rpkm),ncol(rpkm)),
                             color="white")

  yorder <- data.frame(yname=rownames(rpkm),y=1:length(rownames(rpkm))) %>% arrange(y)

  rpkmPlotData <- rpkmPlotData %>% left_join(yorder)

  colorSet <- colorRampPalette(c("darkblue","orangered"))(1001)
  lowColor <- "white"
  highColor <- "red"

  rpkmPlotData$color[rpkmPlotData$val < lowCut] <- lowColor
  rpkmPlotData$color[rpkmPlotData$val > highCut] <- highColor
  intermediates <- rpkmPlotData$val >= lowCut & rpkmPlotData$val <= highCut
  rpkmPlotData$color[intermediates] <- colorSet[round((rpkmPlotData$val[intermediates]-lowCut)/(highCut-lowCut)*1000,0)+1]

  if(counts) {
    xlab <- counts
  } else {
    xlab <- names(rpkm)
  }
  ylab <- yorder$yname

  rpkmPlot <- ggplot(rpkmPlotData) +
    geom_tile(aes(x=x,y=y,fill=color),color="black") +
    scale_fill_identity() +
    scale_y_continuous(labels=ylab,breaks=1:nrow(rpkm),expand=c(0,0),name="peakSet") +
    scale_x_continuous(labels=xlab,breaks=1:ncol(rpkm),expand=c(0,0),name="bamFile") +
    theme_classic() +
    coord_flip() +
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.3))

  return(rpkmPlot)
}


calculateHyperTests <- function(peakList,bamDir,bamFilter=".+",correction=T) {

  n_all_peaks <- sum(unlist(lapply(peakList,length)))

  bamFiles <- list.files(bamDir,bamFilter)

  hyperPValues <- matrix(nrow=length(peakList),ncol=length(bamFiles))

  for(b in 1:length(bamFiles)) {

    bam <- import(paste0(bamDir,bamFiles[b]),format="bam")

    for(i in 1:length(peakList)) {

      q <- sum(countOverlaps(bam,peakList[[i]]))
      m <- length(peakList[[i]])
      n <- n_all_peaks - m
      k <- length(bam)

      hyperPValues[i,b] <- 1 - phyper(q, m, n, k)

    }
  }

  if(correction) {
    hyperPValues <- hyperPValues * ( ncol(hyperPValues) * nrow(hyperPValues) )
    hyperPValues[hyperPValues > 1] <- 1
  }

  hyperPValues <- as.data.frame(hyperPValues)
  names(hyperPValues) <- bamFiles
  row.names(hyperPValues) <- names(peakList)

  return(hyperPValues)

}

plotHyperTests <- function(rpkm,counts=F) {

  rpkmPlotData <- data.frame(val=-log10(unlist(data.frame(rpkm))),
                             x=rep(1:ncol(rpkm),each=nrow(rpkm)),
                             yname=rep(rownames(rpkm),ncol(rpkm)),
                             color="white")

  yorder <- data.frame(yname=rownames(rpkm),y=1:length(rownames(rpkm))) %>% arrange(y)

  rpkmPlotData <- rpkmPlotData %>% left_join(yorder)

  colorSet <- colorRampPalette(c("darkblue","orangered"))(1001)
  lowCut <- 1
  lowColor <- "white"
  highCut <- 25
  highColor <- "red"

  rpkmPlotData$color[rpkmPlotData$val < lowCut] <- lowColor
  rpkmPlotData$color[rpkmPlotData$val > highCut] <- highColor
  intermediates <- rpkmPlotData$val >= lowCut & rpkmPlotData$val <= highCut
  rpkmPlotData$color[intermediates] <- colorSet[round((rpkmPlotData$val[intermediates]-lowCut)/(highCut-lowCut)*1000,0)+1]

  if(counts) {
    xlab <- counts
  } else {
    xlab <- names(rpkm)
  }
  ylab <- yorder$yname

  rpkmPlot <- ggplot(rpkmPlotData) +
    geom_tile(aes(x=x,y=y,fill=color),color="black") +
    scale_fill_identity() +
    scale_y_continuous(labels=ylab,breaks=1:nrow(rpkm),expand=c(0,0),name="peakSet") +
    scale_x_continuous(labels=xlab,breaks=1:ncol(rpkm),expand=c(0,0),name="bamFile") +
    theme_classic() +
    theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.3))

  return(rpkmPlot)
}


#' # Returns only jd
#' This version is slower.
#'
#' fragment_overlap_jaccard_parallel3 <- function(N) {
#'   ol <- findOverlaps(fragment_list[[N]],
#'                       fragment_list[(N+1):length(fragment_list)])
#'
#'   ol <- table(subjectHits(ol))
#'
#'   n1 <- fragment_lengths[N]
#'   n2 <- fragment_lengths[(N+1):length(fragment_list)]
#'
#'   jd <- 1 - ol / (n1 + n2)
#'
#'   matrix(c(jd, rep(N, length(jd))), ncol = 2)
#' }
#'
#' #' Compute jaccard similarity values based on GenomicRanges overlaps in parallel.
#' #'
#' #' @param fragment_list The list object containing GenomicRanges objects.
#' #' @param sample_names Sample names. If NULL, will use BAM file names.
#' #' @param n_cores The number of cores to use in parallel. Use "auto" to detect and use all cores. Default is 6.
#' #' @param cluster_type EXPERIMENTAL: Either "PSOCK" (for Windows) or "FORK" (possible on Linux and Mac). FORK is more memory-efficient.
#' #'
#' #' @return a list of GenomicRanges objects
#' #'
#' run_fragment_overlap_jaccard_parallel3 <- function(fragment_list,
#'                                                    n_cores = 6,
#'                                                    cluster_type = "PSOCK") {
#'
#'   fragment_list <- GRangesList(fragment_list)
#'
#'   #index_pairs <- t(combn(1:length(fragment_list),2))
#'   fragment_lengths <- unlist(lapply(fragment_list, length))
#'
#'   # Set up parallelization
#'   if(n_cores == "auto") {
#'     n_cores <- parallel::detectCores()
#'   }
#'
#'   print(paste("Starting",n_cores,"nodes"))
#'
#'   cl <- parallel::makeCluster(n_cores, cluster_type)
#'
#'   print("Exporting necessary objects to nodes")
#'
#'   parallel::clusterEvalQ(cl, library(GenomicRanges))
#'   parallel::clusterExport(cl, c("fragment_list",
#'                                 "fragment_lengths",
#'                                 "fragment_overlap_jaccard_parallel"),
#'                           # Use the function's local environment for export
#'                           envir = environment())
#'
#'   N <- length(fragment_list) - 1
#'
#'   print(paste("Running",N,"comparisons."))
#'
#'   res <- clusterApplyLB_chunks(N = N,
#'                                n_chunks = 20,
#'                                cl = cl,
#'                                FUN = fragment_overlap_jaccard_parallel3)
#'
#'   stopCluster(cl)
#'
#'   print("Collecting results.")
#'
#'   res <- do.call("rbind", res)
#'   res <- res[,1][order(res[,2])]
#'
#'   res_mat <- matrix(0, length(fragment_list), length(fragment_list))
#'   res_mat[lower.tri(res_mat, diag = FALSE)] <- res
#'   res_mat <- t(res_mat)
#'   res_mat[lower.tri(res_mat, diag = FALSE)] <- res
#'
#'   return(res_mat)
#' }
AllenInstitute/lowcat documentation built on Oct. 30, 2019, 4:45 a.m.