ChIPanalyzer: ChIPAnalyzer

Documented in getQuadMatrix

#load BED file for ChIP-seq data into a dataframe
createChipBedFrame <- function(bedPath) {
  bindSites <- read.table(bedPath,
                          sep = "\t",
                          header = FALSE)
  if (nrow(bindSites) == 0) {
    warning("Empty BED file specified")
  }
  colnames(bindSites) <- c("chrom", "start", "end", "name",
                         "score", "strand")
  return(bindSites)
}

# get the DNA sequence in the vicinity of each ChIP-seq peak
getSurroundingSeq <- function(bedFrame, seqWidth, assembly = "hg19") {
  #make a genomic ranges object from ChIP data table
  bedRange <- GenomicRanges::makeGRangesFromDataFrame(bedFrame,
                                                      keep.extra.columns = FALSE)
  #resize the genomic ranges
  bindSitesResized <- GenomicRanges::resize(bedRange,
                                            width = seqWidth,
                                            fix='center')
  #lead sequencs in Granges from the specified assembly
  if (assembly == "hg19") {
    seq <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg19,
                            bindSitesResized)
  } else if (assembly == "hg38") {
    seq <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
                              bindSitesResized)
  } else if (assembly == "mm9") {
    seq <- Biostrings::getSeq(BSgenome.Mmusculus.UCSC.mm9,
                              bindSitesResized)
  } else if (assembly == "mm10") {
    seq <- Biostrings::getSeq(BSgenome.Mmusculus.UCSC.mm10,
                              bindSitesResized)
  }
  return(seq)
}

#'Generate Reports on Potential G-quadruplex forming Strings
#'near ChIP peaks using pqsfinder
#'
#'Searches for G-quadruplexes in DNA sequences around the ChIP peaks from the BED6 or
#'greater file specified throught the function argument bedPath, and returns a list
#'of reports for each sequence generated by the pqsfinder package.
#'
#'@param bedPath the path to a BED6 or greater file containing processed
#' ChIP-seq peaks. If data is not stranded, a period should be specified
#' in the strand column.
#'@param seqWidth the width of the sequence around the centre of each peak that
#' should be searched for quadruplexes.
#'@param assemblyVersion the genome assembly to retrive sequences from.
#' Must be one of "hg19", "hg38", "mm9" or "mm10"
#'
#'@return returns a list of reports on potential quadruplex forming strings
#' generated by pqsfinder::pqsfinder().
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' findQuads(bedPath = path, seqWidth = 200, assemblyVersion = "hg19")
#'
#'@references
#' Hon J, Martinek T, Zendulka J, Lexa M. (2017) pqsfinder: an
#' exhaustive and imperfection-tolerant search tool for
#' potential quadruplex-forming sequences in R. Bioinformatics.
#' 33(21), 3373-3379.
#' https://doi.org/10.1093/bioinformatics/btx413
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Hsapiens.UCSC.hg19: Full genome sequences for Homo
#' sapiens (UCSC version hg19). R package version 1.4.0.
#'
#' The Bioconductor Dev Team (2015).
#' BSgenome.Hsapiens.UCSC.hg38: Full genome sequences for Homo
#' sapiens (UCSC version hg38). R package version 1.4.1.
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Mmusculus.UCSC.mm9: Full genome sequences for Mus
#' musculus (UCSC version mm9). R package version 1.4.0.
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Mmusculus.UCSC.mm10: Full genome sequences for Mus
#' musculus (UCSC version mm10). R package version 1.4.0.
#'
#' Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al.
#' (2013) Software for Computing and Annotating Genomic Ranges.
#' PLoS Comput Biol 9(8): e1003118.
#' doi:10.1371/journal.pcbi.1003118
#'
#' H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2019).
#' Biostrings: Efficient manipulation of biological strings. R
#' package version 2.54.0.
#'
#'@export
#'@importFrom BSgenome.Hsapiens.UCSC.hg19 BSgenome.Hsapiens.UCSC.hg19
#'@importFrom BSgenome.Hsapiens.UCSC.hg38 BSgenome.Hsapiens.UCSC.hg38
#'@importFrom BSgenome.Mmusculus.UCSC.mm9 BSgenome.Mmusculus.UCSC.mm9
#'@importFrom BSgenome.Mmusculus.UCSC.mm10 BSgenome.Mmusculus.UCSC.mm10
#'@importFrom Biostrings reverseComplement
#'@importFrom pqsfinder pqsfinder
#'@importFrom pqsfinder PQSViews
#'@import GenomicRanges
findQuads <- function (bedPath, seqWidth, assemblyVersion = "hg19") {
  supportedAssemblies <- c("hg19", "hg38", "mm9", "mm10")
  #check that the assembly specified is valid
  if (!assemblyVersion %in% supportedAssemblies) {
    stop("Assembly specified not supported")
  }
  if (seqWidth < 1) {
    stop("seqWidth must be at least 1")
  }
  if (seqWidth > 1500) {
    warning("large seqWidth values may lead to errors if sequence exceeds
            chromosome bounds")
  }
  #load the ChIP peaks from the BED file into a dataframe
  bindSiteFrame <- createChipBedFrame(bedPath = bedPath)
  # get the sequences around the centre of the ChIP-peak
  testSeqs <- getSurroundingSeq(bindSiteFrame,
                                seqWidth = seqWidth,
                                assembly = assemblyVersion)
  #run pqsfinder on the sequences
  library("pqsfinder")
  quadReports <- lapply(testSeqs, pqsfinder::pqsfinder)
  return(quadReports)
}

#get only the maximum scoring quadruplexes from each string
getMaxOnly <- function(prelimScores, prelimStarts, prelimWidths) {
  if (length(prelimScores) > 0) {
    maxScoreInd <- which(prelimScores == max(prelimScores))[[1]]
    maxScoreVar <- prelimScores[maxScoreInd]
    maxStartVar <- prelimStarts[maxScoreInd]
    maxWidthVar <- prelimWidths[maxScoreInd]
  } else {
    maxScoreVar <- 0
    maxStartVar <- 0
    maxWidthVar <- 0
  }
  maxQuad <- c(maxScoreVar, maxStartVar, maxWidthVar)
  names(maxQuad) <- c("score", "start", "width")
  return(maxQuad)
}

#format the data for conversion into a binary vector
formatQuadData <- function(prelimScores, prelimStarts, prelimWidths) {
  if (length(prelimScores) > 0) {
    quad <- list(prelimScores, prelimStarts, prelimWidths)
    names(quad) <- c("score", "start", "width")
  } else {
    scoreVar <- 0
    startVar <- 0
    widthVar <- 0
    quad <- list(scoreVar, startVar, widthVar)
    names(quad) <- c("score", "start", "width")
  }
  return(quad)
}


#get the score, start position, and width of each quadruplex
getBasicStats <- function(quadResults, seqWidth, maxOnly) {
  #get the scores, start postitions, and widths of any quadruplexes
  quadScores <- lapply(quadResults, score)
  quadStart <- lapply(quadResults, start)
  quadWidth <- lapply(quadResults, width)
  prelimBasic <- list(quadScores, quadStart, quadWidth)
  names(prelimBasic) <- c("scores", "starts", "widths")
  #filter out stats from non-maximal G-quadruplexes
  if (maxOnly == TRUE) {
    quadBasic <- mapply(getMaxOnly, prelimBasic$scores,
                      prelimBasic$start, prelimBasic$width,
                      SIMPLIFY = FALSE)
  } else {
    quadBasic <- mapply(formatQuadData, prelimBasic$scores,
                        prelimBasic$start, prelimBasic$width,
                        SIMPLIFY = FALSE)
  }

  return(quadBasic)
}

#return a binary vector of length equal to the length of the
#sequence that was searched for quadruplexes, with ones at positions
#containing a quadruplex and zeroes at other positions
getBinVector <- function(quadData, seqWidth) {
  binVec <- rep(0, seqWidth)
  if (quadData[["start"]][1] != 0) {
    for (i in 1:length(quadData[["start"]])) {
      binVec[quadData[["start"]][i]:(quadData[["start"]][i] + quadData[["width"]][i] - 1)] <- 1
    }
  }
  return(binVec)
}

#'Get A Matrix Representing the positions of G-quadruplexes in Multiple Sequences
#'
#'Create a matrix with a number of rows equivalent to the number of reports in
#'quadReports, and a number of columns equal to seqWidth, where each entry is 1 iff
#'there is a quadruplex at that position in the sequence for the report for that row,
#'and a 0 otherwise.
#'
#'@param quadReports a list of reports (type PQSviews) generated by pqsfinder in
#' the function findQuads. The length of the sequences reported on must be the same
#' and equal to seqWidth
#'
#'@param maxOnly if TRUE, only the maximum scoring G-quadruplexes for each sequence are used.
#'
#'
#'@return a matrix with dimensions length(quadReports) X seqWidth
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' reports <- findQuads(bedPath = path, seqWidth = 200, assemblyVersion = "hg19")
#' quadMatrix <- getQuadMatrix(quadReports = reports, maxOnly = TRUE)
#'
#'@references
#' Hon J, Martinek T, Zendulka J, Lexa M. (2017) pqsfinder: an
#' exhaustive and imperfection-tolerant search tool for
#' potential quadruplex-forming sequences in R. Bioinformatics.
#' 33(21), 3373-3379.
#' https://doi.org/10.1093/bioinformatics/btx413
#'
#'@export
#'@import pqsfinder
getQuadMatrix <- function(quadReports, maxOnly = TRUE) {
  seq1Length <- nchar(as.character(Biostrings::subject(quadReports[[1]])))
  seq2Length <- nchar(as.character(Biostrings::subject(quadReports[[2]])))
  if (seq1Length != seq2Length) {
    stop("Reports not for sequences of equal length")
  }
  seqWidth = seq1Length
  prelimStats <- getBasicStats(quadResults = quadReports,
                               seqWidth = seqWidth,
                               maxOnly = maxOnly)
  binVecs <- lapply(prelimStats, getBinVector,
                    seqWidth = seqWidth)
  binMatrix <- matrix(unlist(binVecs),
                      ncol = seqWidth,
                      byrow = TRUE)
  return(binMatrix)
}


#'Get the percentage abundance of quadruplexes at positons across multiple
#'sequences of equal length
#'
#'Using the matrix returned by getQuadMatrix, calculate the percentage abundance
#'quadruplexes at positions dictated by the columns of quadMatrix
#'
#'@param quadMatrix a matrix in that format returned from getQuadMatrix,
#' with each row representing a different sequence a columns representing
#' positions, with a 1 indicated presence of a quadruplex and 0 otherwise
#'
#' @return a vector of percentages indicating the percentage of rows in
#' quadmatrix that there was a 1(quadruplex) at the position represented
#' by that column
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' reports <- findQuads(bedPath = path,
#'  seqWidth = 200,
#'  assemblyVersion = "hg19")
#' qMatrix <- getQuadMatrix(quadReports = reports, maxOnly = TRUE)
#' quadCoveragePercentage <-
#'  getQuadCoveragePercentage(quadMatrix = qMatrix)
#'
#'@export
getQuadCoveragePercentage <- function (quadMatrix) {
  sums <- colSums(quadMatrix)
  percentage <- (sums / nrow(quadMatrix)) * 100
  return(percentage)
}