#load BED file for ChIP-seq data into a dataframe
createChipBedFrame <- function(bedPath) {
bindSites <- read.table(bedPath,
sep = "\t",
header = FALSE)
if (nrow(bindSites) == 0) {
warning("Empty BED file specified")
}
colnames(bindSites) <- c("chrom", "start", "end", "name",
"score", "strand")
return(bindSites)
}
# get the DNA sequence in the vicinity of each ChIP-seq peak
getSurroundingSeq <- function(bedFrame, seqWidth, assembly = "hg19") {
#make a genomic ranges object from ChIP data table
bedRange <- GenomicRanges::makeGRangesFromDataFrame(bedFrame,
keep.extra.columns = FALSE)
#resize the genomic ranges
bindSitesResized <- GenomicRanges::resize(bedRange,
width = seqWidth,
fix='center')
#lead sequencs in Granges from the specified assembly
if (assembly == "hg19") {
seq <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg19,
bindSitesResized)
} else if (assembly == "hg38") {
seq <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
bindSitesResized)
} else if (assembly == "mm9") {
seq <- Biostrings::getSeq(BSgenome.Mmusculus.UCSC.mm9,
bindSitesResized)
} else if (assembly == "mm10") {
seq <- Biostrings::getSeq(BSgenome.Mmusculus.UCSC.mm10,
bindSitesResized)
}
return(seq)
}
#'Generate Reports on Potential G-quadruplex forming Strings
#'near ChIP peaks using pqsfinder
#'
#'Searches for G-quadruplexes in DNA sequences around the ChIP peaks from the BED6 or
#'greater file specified throught the function argument bedPath, and returns a list
#'of reports for each sequence generated by the pqsfinder package.
#'
#'@param bedPath the path to a BED6 or greater file containing processed
#' ChIP-seq peaks. If data is not stranded, a period should be specified
#' in the strand column.
#'@param seqWidth the width of the sequence around the centre of each peak that
#' should be searched for quadruplexes.
#'@param assemblyVersion the genome assembly to retrive sequences from.
#' Must be one of "hg19", "hg38", "mm9" or "mm10"
#'
#'@return returns a list of reports on potential quadruplex forming strings
#' generated by pqsfinder::pqsfinder().
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' findQuads(bedPath = path, seqWidth = 200, assemblyVersion = "hg19")
#'
#'@references
#' Hon J, Martinek T, Zendulka J, Lexa M. (2017) pqsfinder: an
#' exhaustive and imperfection-tolerant search tool for
#' potential quadruplex-forming sequences in R. Bioinformatics.
#' 33(21), 3373-3379.
#' https://doi.org/10.1093/bioinformatics/btx413
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Hsapiens.UCSC.hg19: Full genome sequences for Homo
#' sapiens (UCSC version hg19). R package version 1.4.0.
#'
#' The Bioconductor Dev Team (2015).
#' BSgenome.Hsapiens.UCSC.hg38: Full genome sequences for Homo
#' sapiens (UCSC version hg38). R package version 1.4.1.
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Mmusculus.UCSC.mm9: Full genome sequences for Mus
#' musculus (UCSC version mm9). R package version 1.4.0.
#'
#' The Bioconductor Dev Team (2014).
#' BSgenome.Mmusculus.UCSC.mm10: Full genome sequences for Mus
#' musculus (UCSC version mm10). R package version 1.4.0.
#'
#' Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al.
#' (2013) Software for Computing and Annotating Genomic Ranges.
#' PLoS Comput Biol 9(8): e1003118.
#' doi:10.1371/journal.pcbi.1003118
#'
#' H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2019).
#' Biostrings: Efficient manipulation of biological strings. R
#' package version 2.54.0.
#'
#'@export
#'@importFrom BSgenome.Hsapiens.UCSC.hg19 BSgenome.Hsapiens.UCSC.hg19
#'@importFrom BSgenome.Hsapiens.UCSC.hg38 BSgenome.Hsapiens.UCSC.hg38
#'@importFrom BSgenome.Mmusculus.UCSC.mm9 BSgenome.Mmusculus.UCSC.mm9
#'@importFrom BSgenome.Mmusculus.UCSC.mm10 BSgenome.Mmusculus.UCSC.mm10
#'@importFrom Biostrings reverseComplement
#'@importFrom pqsfinder pqsfinder
#'@importFrom pqsfinder PQSViews
#'@import GenomicRanges
findQuads <- function (bedPath, seqWidth, assemblyVersion = "hg19") {
supportedAssemblies <- c("hg19", "hg38", "mm9", "mm10")
#check that the assembly specified is valid
if (!assemblyVersion %in% supportedAssemblies) {
stop("Assembly specified not supported")
}
if (seqWidth < 1) {
stop("seqWidth must be at least 1")
}
if (seqWidth > 1500) {
warning("large seqWidth values may lead to errors if sequence exceeds
chromosome bounds")
}
#load the ChIP peaks from the BED file into a dataframe
bindSiteFrame <- createChipBedFrame(bedPath = bedPath)
# get the sequences around the centre of the ChIP-peak
testSeqs <- getSurroundingSeq(bindSiteFrame,
seqWidth = seqWidth,
assembly = assemblyVersion)
#run pqsfinder on the sequences
library("pqsfinder")
quadReports <- lapply(testSeqs, pqsfinder::pqsfinder)
return(quadReports)
}
#get only the maximum scoring quadruplexes from each string
getMaxOnly <- function(prelimScores, prelimStarts, prelimWidths) {
if (length(prelimScores) > 0) {
maxScoreInd <- which(prelimScores == max(prelimScores))[[1]]
maxScoreVar <- prelimScores[maxScoreInd]
maxStartVar <- prelimStarts[maxScoreInd]
maxWidthVar <- prelimWidths[maxScoreInd]
} else {
maxScoreVar <- 0
maxStartVar <- 0
maxWidthVar <- 0
}
maxQuad <- c(maxScoreVar, maxStartVar, maxWidthVar)
names(maxQuad) <- c("score", "start", "width")
return(maxQuad)
}
#format the data for conversion into a binary vector
formatQuadData <- function(prelimScores, prelimStarts, prelimWidths) {
if (length(prelimScores) > 0) {
quad <- list(prelimScores, prelimStarts, prelimWidths)
names(quad) <- c("score", "start", "width")
} else {
scoreVar <- 0
startVar <- 0
widthVar <- 0
quad <- list(scoreVar, startVar, widthVar)
names(quad) <- c("score", "start", "width")
}
return(quad)
}
#get the score, start position, and width of each quadruplex
getBasicStats <- function(quadResults, seqWidth, maxOnly) {
#get the scores, start postitions, and widths of any quadruplexes
quadScores <- lapply(quadResults, score)
quadStart <- lapply(quadResults, start)
quadWidth <- lapply(quadResults, width)
prelimBasic <- list(quadScores, quadStart, quadWidth)
names(prelimBasic) <- c("scores", "starts", "widths")
#filter out stats from non-maximal G-quadruplexes
if (maxOnly == TRUE) {
quadBasic <- mapply(getMaxOnly, prelimBasic$scores,
prelimBasic$start, prelimBasic$width,
SIMPLIFY = FALSE)
} else {
quadBasic <- mapply(formatQuadData, prelimBasic$scores,
prelimBasic$start, prelimBasic$width,
SIMPLIFY = FALSE)
}
return(quadBasic)
}
#return a binary vector of length equal to the length of the
#sequence that was searched for quadruplexes, with ones at positions
#containing a quadruplex and zeroes at other positions
getBinVector <- function(quadData, seqWidth) {
binVec <- rep(0, seqWidth)
if (quadData[["start"]][1] != 0) {
for (i in 1:length(quadData[["start"]])) {
binVec[quadData[["start"]][i]:(quadData[["start"]][i] + quadData[["width"]][i] - 1)] <- 1
}
}
return(binVec)
}
#'Get A Matrix Representing the positions of G-quadruplexes in Multiple Sequences
#'
#'Create a matrix with a number of rows equivalent to the number of reports in
#'quadReports, and a number of columns equal to seqWidth, where each entry is 1 iff
#'there is a quadruplex at that position in the sequence for the report for that row,
#'and a 0 otherwise.
#'
#'@param quadReports a list of reports (type PQSviews) generated by pqsfinder in
#' the function findQuads. The length of the sequences reported on must be the same
#' and equal to seqWidth
#'
#'@param maxOnly if TRUE, only the maximum scoring G-quadruplexes for each sequence are used.
#'
#'
#'@return a matrix with dimensions length(quadReports) X seqWidth
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' reports <- findQuads(bedPath = path, seqWidth = 200, assemblyVersion = "hg19")
#' quadMatrix <- getQuadMatrix(quadReports = reports, maxOnly = TRUE)
#'
#'@references
#' Hon J, Martinek T, Zendulka J, Lexa M. (2017) pqsfinder: an
#' exhaustive and imperfection-tolerant search tool for
#' potential quadruplex-forming sequences in R. Bioinformatics.
#' 33(21), 3373-3379.
#' https://doi.org/10.1093/bioinformatics/btx413
#'
#'@export
#'@import pqsfinder
getQuadMatrix <- function(quadReports, maxOnly = TRUE) {
seq1Length <- nchar(as.character(Biostrings::subject(quadReports[[1]])))
seq2Length <- nchar(as.character(Biostrings::subject(quadReports[[2]])))
if (seq1Length != seq2Length) {
stop("Reports not for sequences of equal length")
}
seqWidth = seq1Length
prelimStats <- getBasicStats(quadResults = quadReports,
seqWidth = seqWidth,
maxOnly = maxOnly)
binVecs <- lapply(prelimStats, getBinVector,
seqWidth = seqWidth)
binMatrix <- matrix(unlist(binVecs),
ncol = seqWidth,
byrow = TRUE)
return(binMatrix)
}
#'Get the percentage abundance of quadruplexes at positons across multiple
#'sequences of equal length
#'
#'Using the matrix returned by getQuadMatrix, calculate the percentage abundance
#'quadruplexes at positions dictated by the columns of quadMatrix
#'
#'@param quadMatrix a matrix in that format returned from getQuadMatrix,
#' with each row representing a different sequence a columns representing
#' positions, with a 1 indicated presence of a quadruplex and 0 otherwise
#'
#' @return a vector of percentages indicating the percentage of rows in
#' quadmatrix that there was a 1(quadruplex) at the position represented
#' by that column
#'
#'@examples
#' path <- system.file("extdata", "MAZ_very_small_test.bed", package = "ChIPAnalyzer")
#' reports <- findQuads(bedPath = path,
#' seqWidth = 200,
#' assemblyVersion = "hg19")
#' qMatrix <- getQuadMatrix(quadReports = reports, maxOnly = TRUE)
#' quadCoveragePercentage <-
#' getQuadCoveragePercentage(quadMatrix = qMatrix)
#'
#'@export
getQuadCoveragePercentage <- function (quadMatrix) {
sums <- colSums(quadMatrix)
percentage <- (sums / nrow(quadMatrix)) * 100
return(percentage)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.