#' set path for reference genome object
#'
#' This method defines the path where reference genome is stored
#'
#' @param reference.file is the location of the fasta format sequence file for
#' reference genome
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' # and check that something has been stored ...
#' getReferenceGenome()
#'
#' @export
setReferenceGenome <- function(reference.file) {
assign("reference.file", reference.file, envir = get(getEnvironment()))
}
#' get path reference genome object
#'
#' This method returns the stored path for the reference genome object
#'
#' @return character representation of path to stored fasta format file object
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' # and check that something has been stored ...
#' getReferenceGenome()
#'
#' @export
getReferenceGenome <- function() {
return(get("reference.file", envir = get(getEnvironment())))
}
#' load reference genome into memory
#'
#' This method will parse the defined reference.file into a DNAStringSet object
#' for handling in memory
#'
#' @importFrom Biostrings readDNAStringSet
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' loadReferenceGenome()
#'
#' @export
loadReferenceGenome <- function() {
# derive a referenceGenome object from the named fasta elements in the
# provided fasta reference
# resource
referenceGenomeSequence <- readDNAStringSet(getReferenceGenome())
referenceGenome <- data.frame(id = gsub(" .+", "",
names(referenceGenomeSequence)), sid = seq_along(
names(referenceGenomeSequence)), stringsAsFactors = FALSE)
referenceGenome$sid <- seq(nrow(referenceGenome))
assign("referenceGenome", referenceGenome, envir = get(getEnvironment()))
assign("referenceGenomeSequence", referenceGenomeSequence, envir =
get(getEnvironment()))
}
#' cleanup the reference genome data
#'
#' This method will remove unwanted chromosomes from the reference genome
#' collection; useful for when analyses are to
#' be linked to a set of core autosomes, for example.
#'
#' @param delIds is a vector of chromosomes to be dropped
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeIds()
#' cleanReferenceGenome(getChromosomeIds()[1])
#' getChromosomeIds()
#'
#' @export
cleanReferenceGenome <- function(delIds) {
referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
overlap <- match(delIds, referenceGenome[, 1])
if (TRUE %in% is.na(overlap)) {
overlap <- overlap[-is.na(overlap)]
}
if (length(overlap) > 0) {
referenceGenome <- referenceGenome[-overlap, ]
assign("referenceGenome", referenceGenome, envir=get(getEnvironment()))
}
invisible()
}
#' accessory method for mapping named chromosomes to their pointers in the
#' reference fasta
#'
#' This method will return the numerical index for the named chromosome within
#' the referenceGenomeSequence
#'
#' @param chrId is the identifier for the chromosomes to be dropped
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getStringSetId(getChromosomeIds())
#' getStringSetId("Escherichia_coli_chromosome")
#'
#' @export
getStringSetId <- function(chrId) {
if (!(exists("referenceGenome", envir = get(getEnvironment())) &
exists("referenceGenomeSequence",
envir = get(getEnvironment())))) {
loadReferenceGenome()
}
referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
return(referenceGenome[match(as.character(chrId),
as.character(referenceGenome[, 1])), "sid"])
}
#' returns a DNAStringSet object corresponding to specified chromosome from
#' reference genome
#'
#' This method will return a DNAStringSet object from the reference genome;
#' requires a numeric pointer to the object
#'
#' @param dnaStringSetId is the pointer to use
#' @return DNAStringSet corresponding to pointer provided
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeSequence(getStringSetId("Escherichia_coli_chromosome"))
#'
#' @seealso [getStringSetId()] for method to prepare numeric pointer
#' @export
getChromosomeSequence <- function(dnaStringSetId) {
if (!(exists("referenceGenome", envir = get(getEnvironment())) &
exists("referenceGenomeSequence",
envir = get(getEnvironment())))) {
loadReferenceGenome()
}
referenceGenomeSequence <- get("referenceGenomeSequence", envir =
get(getEnvironment()))
return(referenceGenomeSequence[[dnaStringSetId]])
}
#' returns a GRanges object corresponding to the currently loaded refgenome
#'
#' This method will return a DNAStringSet object from the reference genome;
#' requires a numeric pointer to the object
#'
#' @return DNAStringSet corresponding to pointer provided
#'
#' @examples
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getReferenceGenomeGR()
#'
#' @export
getReferenceGenomeGR <- function() {
if (!(exists("referenceGenome", envir = get(getEnvironment())) &&
exists("referenceGenomeSequence", envir = get(getEnvironment())))) {
loadReferenceGenome()
}
referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
referenceGenomeSequence <- get("referenceGenomeSequence", envir =
get(getEnvironment()))
return(GRanges(seqnames=referenceGenome[,1],
IRanges(start=1, end=width(referenceGenomeSequence)[
referenceGenome[,2]])))
}
#' get chromosome identifiers from reference genome
#'
#' This method will return a vector of chromsome identifiers
#'
#' @return vector of names
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeIds()
#'
#' @export
getChromosomeIds <- function() {
if (!(exists("referenceGenome", envir = get(getEnvironment())) &
exists("referenceGenomeSequence",
envir = get(getEnvironment())))) {
loadReferenceGenome()
}
return(get("referenceGenome", envir = get(getEnvironment()))[, 1])
}
#' get chromosome lengths for a named set of chromosomes
#'
#' This method will return a vector of named chromosome lengths
#'
#' @param x vector of chromosome ids
#' @return vector of names
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getSeqLengths(getChromosomeIds())
#'
#' @export
getSeqLengths <- function(x) {
if (!(exists("referenceGenome", envir = get(getEnvironment())) &&
exists("referenceGenomeSequence",
envir = get(getEnvironment())))) {
loadReferenceGenome()
}
keys <- gsub("\\s.+", "", names(get("referenceGenomeSequence", envir =
get(getEnvironment()))))
wdata <- width(get("referenceGenomeSequence", envir =
get(getEnvironment())))[match(x, keys)]
names(wdata) <- x
return(wdata)
}
#' prepare mapping summary information by chromosome
#'
#' This method will return a data.frame of basic per chromosome mapping
#' statistics
#'
#' @importFrom gtools mixedsort
#' @importFrom scales comma
#' @importFrom Biostrings letterFrequency
#' @import magrittr
#' @importFrom S4Vectors mcols
#' @import GenomeInfoDb
#' @importFrom GenomicRanges tileGenome
#' @importFrom GenomicRanges binnedAverage
#' @importFrom GenomicRanges coverage
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges
#' @importFrom dplyr filter
#' @importFrom scales comma_format
#' @importFrom rlang .data
#' @importFrom gtools mixedsort
#' @param chrIds vector of chromosome ids
#' @param bamFile to the bamFile used for analysis
#' @param flag to define whether mapping is reported at the Primary level
#' @return data.frame of mapping characteristics
#'
#' @examples
#' init()
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' chromosomeMappingSummary(getChromosomeIds(), demoBam)
#'
#' @export
chromosomeMappingSummary <- function(chrIds, bamFile, flag = "Primary") {
bamSummary <- bamSummarise(bamFile, blockSize = 10000)
getChrData <- function(id, bamSummary, flag = "Primary") {
dna <- getChromosomeSequence(getStringSetId(id))
letterFreq <- letterFrequency(dna, c("A", "C", "G", "T", "N"))
mapChr <- bamSummary %>%
filter(.data$readFlag == flag & .data$rname == id)
# depending on the genome used there may be a load of warnings here
# this is likely due to reads mapping beyond segment boundaries -
# warnings are masked here since they are expected
suppressWarnings(gr <- GRanges(seqnames = mapChr$rname, ranges =
IRanges(start = mapChr$start, end = mapChr$end), strand =
mapChr$strand, seqlengths = getSeqLengths(levels(mapChr$rname))))
mc <- coverage(gr)
bi <- tileGenome(seqlengths(gr), ntile = 1)
cd <- binnedAverage(bi[[1]], mc, "binned_cov")
ke <- which(as.character(seqnames(cd)) == id)
co <- mcols(cd[ke, ])$binned_cov
return(c(chrId = id, chrLength =(scales::comma_format())(length(dna)),
`N (%)` =paste0(round(letterFreq["N"]/length(dna) * 100, digits=2)),
`GC (%)`=paste0(round((letterFreq["G"]+letterFreq["G"])/length(dna)
*100, digits = 2)), `Mapped Reads` = (scales::comma_format())
(nrow(mapChr)), `Mapped Bases` = (scales::comma_format())(sum(
mapChr$coverage *mapChr$qwidth)), `Mean Coverage` = paste0(
round(co, digits = 2))))
}
chromosomeData <- data.frame(t(as.data.frame(lapply(gtools::mixedsort(
unique(chrIds)), getChrData, bamSummary = bamSummary, flag = flag))),
stringsAsFactors = FALSE, row.names = NULL)
return(chromosomeData)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.