R/generate.peaks.from.gmm.R

Defines functions .generate.peaks.from.gmm

Documented in .generate.peaks.from.gmm

#' generate.peaks.from.gmm is a function that takes the output of a GMM and generates peaks
#'
#' @param dp_data Results from fitting a Dirichlet Process GMM model
#' @param PARAMETERS A PARAMETERS list with the parameters indicated in the DPDE4PM function
#' @param GENEINFO A list of gene attributes generated by the get.gene.anno function
#'
#' @return
#' \describe{
#'  \item{merged.peaks.filtered.rna}{A merged peaks GenomicRanges object in RNA coordinates}
#'  \item{merged.peaks.filtered.genome}{A merged peaks GenomicRanges object in genomic coordinates}
#' }
.generate.peaks.from.gmm = function(
  dp_data,
  PARAMETERS,
  GENEINFO
){

  # Filtering by Threshold
  dp_data = dp_data[dp_data$Weights > PARAMETERS$WEIGHT.THRESHOLD,]
  dp_data = dp_data[complete.cases(dp_data),]
  if(nrow(dp_data) == 0){
    warning("No Peaks Survive Past The Weight Threshold or DP could not fit reasonable Gaussians. Consider Lowering Requirements or Tuning Parameters.",
            call. = TRUE, domain = NULL)
    return(list(GenomicRanges::GRanges(),  GenomicRanges::GRanges()))
  }
  dp_data = dp_data[order(-dp_data$Weights),]

  # Creating Peaks
  merged.peaks = data.frame(
    "chr" = GENEINFO$chr,
    "start" = round(dp_data$Mu - PARAMETERS$N.SD*dp_data$Sigma),
    "end" = round(dp_data$Mu + PARAMETERS$N.SD*dp_data$Sigma),
    "name" = GENEINFO$gene,
    "strand" = GENEINFO$strand,
    "weights" = dp_data$Weights,
    "i" = dp_data$i,
    "j" = seq(1, nrow(dp_data), 1),
    stringsAsFactors = F
  )
  merged.peaks$start = ifelse(merged.peaks$start < 1, 1, merged.peaks$start)
  merged.peaks$end = ifelse(merged.peaks$end > GENEINFO$exome_length,  GENEINFO$exome_length, merged.peaks$end)
  merged.peaks = merged.peaks[!duplicated(merged.peaks[,c("chr", "start", "end", "name", "strand")]),]

  # Filtering peaks where 1 peak is within the other peak
  merged.peaks.gr =  GenomicRanges::makeGRangesFromDataFrame(merged.peaks, keep.extra.columns = T)
  within.peaks = GenomicRanges::findOverlaps(merged.peaks.gr, merged.peaks.gr, type = "within")
  within.peaks = within.peaks[S4Vectors::subjectHits(within.peaks) != S4Vectors::queryHits(within.peaks)]
  if(length(within.peaks) > 0){
    wid = IRanges::width(merged.peaks.gr)
    remove.elements = ifelse(wid[S4Vectors::queryHits(within.peaks)] > wid[S4Vectors::subjectHits(within.peaks)], S4Vectors::subjectHits(within.peaks), S4Vectors::queryHits(within.peaks))
    merged.peaks.gr = merged.peaks.gr[!1:length(merged.peaks.gr) %in% remove.elements]
  }

  # Subtracting Overlapping Regions
  merged.peaks.filtered.rna = GenomicRanges::GRanges()
  for ( i in 1:length(merged.peaks.gr)){
    if(i == 1){
      tmp.gr = merged.peaks.gr[1]
    }else{
      tmp.gr = GenomicRanges::setdiff(merged.peaks.gr[i], merged.peaks.gr[1:(i-1)])
      if(length(tmp.gr) > 0){S4Vectors::mcols(tmp.gr) = S4Vectors::mcols(merged.peaks.gr[i])}
    }
    merged.peaks.filtered.rna = c(merged.peaks.filtered.rna, tmp.gr)
  }

  # Removing peaks that are below resolution
  # wid = IRanges::width(merged.peaks.filtered.rna)
  # remove.elements = which(wid < PARAMETERS$RESOLUTION)
  # merged.peaks.filtered.rna = merged.peaks.filtered.rna[!1:length(merged.peaks.filtered.rna) %in% remove.elements]
  # if(length(merged.peaks.filtered.rna) == 0){
  #   warning("No Peaks Survive After The Width Filter",
  #           call. = TRUE, domain = NULL)
  #   return(list(GenomicRanges::GRanges(),  GenomicRanges::GRanges()))
  # }

  # Transferring to Genomic Coordinates
  merged.peaks.genome = merged.peaks.filtered.rna
  GenomicRanges::end(merged.peaks.genome) = GENEINFO$RNA2DNA[GenomicRanges::end(merged.peaks.genome)]
  GenomicRanges::start(merged.peaks.genome) = GENEINFO$RNA2DNA[GenomicRanges::start(merged.peaks.genome)]
  anno_gr = GenomicRanges::makeGRangesFromDataFrame(GENEINFO$anno)

  # Filtering Out Introns
  merged.peaks.filtered.genome = GenomicRanges::GRanges()
  for(i in 1:length(merged.peaks.genome)){
    tmp.gr = GenomicRanges::intersect(merged.peaks.genome[i], anno_gr)
    if(length(tmp.gr) > 0){S4Vectors::mcols(tmp.gr) = S4Vectors::mcols(merged.peaks.genome[i])}
    merged.peaks.filtered.genome = c(merged.peaks.filtered.genome, tmp.gr)
  }


  return(list(merged.peaks.filtered.rna, merged.peaks.filtered.genome))
}
helen-zhu/DPDE4PM documentation built on Feb. 17, 2021, 9:46 a.m.