nanopoRe: Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

Documented in chromosomeMappingSummary cleanReferenceGenome getChromosomeIds getChromosomeSequence getReferenceGenome getReferenceGenomeGR getSeqLengths getStringSetId loadReferenceGenome setReferenceGenome

#' set path for reference genome object
#'
#' This method defines the path where reference genome is stored
#'
#' @param reference.file is the location of the fasta format sequence file for
#' reference genome
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' # and check that something has been stored ...
#' getReferenceGenome()
#'
#' @export
setReferenceGenome <- function(reference.file) {
    assign("reference.file", reference.file, envir = get(getEnvironment()))
}

#' get path reference genome object
#'
#' This method returns the stored path for the reference genome object
#'
#' @return character representation of path to stored fasta format file object
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' # and check that something has been stored ...
#' getReferenceGenome()
#'
#' @export
getReferenceGenome <- function() {
    return(get("reference.file", envir = get(getEnvironment())))
}



#' load reference genome into memory
#'
#' This method will parse the defined reference.file into a DNAStringSet object
#' for handling in memory
#'
#' @importFrom Biostrings readDNAStringSet
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' loadReferenceGenome()
#'
#' @export
loadReferenceGenome <- function() {
    # derive a referenceGenome object from the named fasta elements in the
    # provided fasta reference
    # resource
    referenceGenomeSequence <- readDNAStringSet(getReferenceGenome())
    referenceGenome <- data.frame(id = gsub(" .+", "",
        names(referenceGenomeSequence)), sid = seq_along(
        names(referenceGenomeSequence)), stringsAsFactors = FALSE)
    referenceGenome$sid <- seq(nrow(referenceGenome))
    assign("referenceGenome", referenceGenome, envir = get(getEnvironment()))
    assign("referenceGenomeSequence", referenceGenomeSequence, envir =
        get(getEnvironment()))
}


#' cleanup the reference genome data
#'
#' This method will remove unwanted chromosomes from the reference genome
#' collection; useful for when analyses are to
#' be linked to a set of core autosomes, for example.
#'
#' @param delIds is a vector of chromosomes to be dropped
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeIds()
#' cleanReferenceGenome(getChromosomeIds()[1])
#' getChromosomeIds()
#'
#' @export
cleanReferenceGenome <- function(delIds) {
    referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
    overlap <- match(delIds, referenceGenome[, 1])
    if (TRUE %in% is.na(overlap)) {
        overlap <- overlap[-is.na(overlap)]
    }
    if (length(overlap) > 0) {
        referenceGenome <- referenceGenome[-overlap, ]
        assign("referenceGenome", referenceGenome, envir=get(getEnvironment()))
    }
    invisible()
}


#' accessory method for mapping named chromosomes to their pointers in the
#' reference fasta
#'
#' This method will return the numerical index for the named chromosome within
#' the referenceGenomeSequence
#'
#' @param chrId is the identifier for the chromosomes to be dropped
#' @return None
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getStringSetId(getChromosomeIds())
#' getStringSetId("Escherichia_coli_chromosome")
#'
#' @export
getStringSetId <- function(chrId) {
    if (!(exists("referenceGenome", envir = get(getEnvironment())) &
        exists("referenceGenomeSequence",
        envir = get(getEnvironment())))) {
        loadReferenceGenome()
    }
    referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
    return(referenceGenome[match(as.character(chrId),
        as.character(referenceGenome[, 1])), "sid"])
}




#' returns a DNAStringSet object corresponding to specified chromosome from
#' reference genome
#'
#' This method will return a DNAStringSet object from the reference genome;
#' requires a numeric pointer to the object
#'
#' @param dnaStringSetId is the pointer to use
#' @return DNAStringSet corresponding to pointer provided
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeSequence(getStringSetId("Escherichia_coli_chromosome"))
#'
#' @seealso [getStringSetId()] for method to prepare numeric pointer
#' @export
getChromosomeSequence <- function(dnaStringSetId) {
    if (!(exists("referenceGenome", envir = get(getEnvironment())) &
        exists("referenceGenomeSequence",
        envir = get(getEnvironment())))) {
        loadReferenceGenome()
    }
    referenceGenomeSequence <- get("referenceGenomeSequence", envir =
        get(getEnvironment()))
    return(referenceGenomeSequence[[dnaStringSetId]])
}


#' returns a GRanges object corresponding to the currently loaded refgenome
#'
#' This method will return a DNAStringSet object from the reference genome;
#' requires a numeric pointer to the object
#'
#' @return DNAStringSet corresponding to pointer provided
#'
#' @examples
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getReferenceGenomeGR()
#'
#' @export
getReferenceGenomeGR <- function() {
    if (!(exists("referenceGenome", envir = get(getEnvironment())) &&
        exists("referenceGenomeSequence", envir = get(getEnvironment())))) {
            loadReferenceGenome()
    }
    referenceGenome <- get("referenceGenome", envir = get(getEnvironment()))
    referenceGenomeSequence <- get("referenceGenomeSequence", envir =
        get(getEnvironment()))
    return(GRanges(seqnames=referenceGenome[,1],
        IRanges(start=1, end=width(referenceGenomeSequence)[
        referenceGenome[,2]])))
}



#' get chromosome identifiers from reference genome
#'
#' This method will return a vector of chromsome identifiers
#'
#' @return vector of names
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getChromosomeIds()
#'
#' @export
getChromosomeIds <- function() {
    if (!(exists("referenceGenome", envir = get(getEnvironment())) &
        exists("referenceGenomeSequence",
        envir = get(getEnvironment())))) {
        loadReferenceGenome()
    }
    return(get("referenceGenome", envir = get(getEnvironment()))[, 1])
}


#' get chromosome lengths for a named set of chromosomes
#'
#' This method will return a vector of named chromosome lengths
#'
#' @param x vector of chromosome ids
#' @return vector of names
#'
#' @examples
#' init()
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' getSeqLengths(getChromosomeIds())
#'
#' @export
getSeqLengths <- function(x) {
    if (!(exists("referenceGenome", envir = get(getEnvironment())) &&
        exists("referenceGenomeSequence",
        envir = get(getEnvironment())))) {
        loadReferenceGenome()
    }
    keys <- gsub("\\s.+", "", names(get("referenceGenomeSequence", envir =
        get(getEnvironment()))))
    wdata <- width(get("referenceGenomeSequence", envir =
        get(getEnvironment())))[match(x, keys)]
    names(wdata) <- x
    return(wdata)
}


#' prepare mapping summary information by chromosome
#'
#' This method will return a data.frame of basic per chromosome mapping
#' statistics
#'
#' @importFrom gtools mixedsort
#' @importFrom scales comma
#' @importFrom Biostrings letterFrequency
#' @import magrittr
#' @importFrom S4Vectors mcols
#' @import GenomeInfoDb
#' @importFrom GenomicRanges tileGenome
#' @importFrom GenomicRanges binnedAverage
#' @importFrom GenomicRanges coverage
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges
#' @importFrom dplyr filter
#' @importFrom scales comma_format
#' @importFrom rlang .data
#' @importFrom gtools mixedsort
#' @param chrIds vector of chromosome ids
#' @param bamFile to the bamFile used for analysis
#' @param flag to define whether mapping is reported at the Primary level
#' @return data.frame of mapping characteristics
#'
#' @examples
#' init()
#' demoBam <- system.file("extdata",
#'     "Ecoli_zymo_R10_filt_subs.bam",
#'     package = "nanopoRe")
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' chromosomeMappingSummary(getChromosomeIds(), demoBam)
#'
#' @export
chromosomeMappingSummary <- function(chrIds, bamFile, flag = "Primary") {

    bamSummary <- bamSummarise(bamFile, blockSize = 10000)

    getChrData <- function(id, bamSummary, flag = "Primary") {
        dna <- getChromosomeSequence(getStringSetId(id))
        letterFreq <- letterFrequency(dna, c("A", "C", "G", "T", "N"))
        mapChr <- bamSummary %>%
            filter(.data$readFlag == flag & .data$rname == id)

        # depending on the genome used there may be a load of warnings here
        # this is likely due to reads mapping beyond segment boundaries -
        # warnings are masked here since they are expected
        suppressWarnings(gr <- GRanges(seqnames = mapChr$rname, ranges =
            IRanges(start = mapChr$start, end = mapChr$end), strand =
            mapChr$strand, seqlengths = getSeqLengths(levels(mapChr$rname))))
        mc <- coverage(gr)
        bi <- tileGenome(seqlengths(gr), ntile = 1)
        cd <- binnedAverage(bi[[1]], mc, "binned_cov")

        ke <- which(as.character(seqnames(cd)) == id)
        co <- mcols(cd[ke, ])$binned_cov

        return(c(chrId = id, chrLength =(scales::comma_format())(length(dna)),
            `N (%)` =paste0(round(letterFreq["N"]/length(dna) * 100, digits=2)),
            `GC (%)`=paste0(round((letterFreq["G"]+letterFreq["G"])/length(dna)
            *100, digits = 2)), `Mapped Reads` = (scales::comma_format())
            (nrow(mapChr)), `Mapped Bases` = (scales::comma_format())(sum(
            mapChr$coverage *mapChr$qwidth)), `Mean Coverage` = paste0(
            round(co, digits = 2))))
    }

    chromosomeData <- data.frame(t(as.data.frame(lapply(gtools::mixedsort(
        unique(chrIds)), getChrData, bamSummary = bamSummary, flag = flag))),
        stringsAsFactors = FALSE, row.names = NULL)
    return(chromosomeData)
}

sagrudd/nanopoRe documentation built on June 7, 2020, 10:20 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sagrudd/nanopoRe
Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

R/ReferenceGenome.R
In sagrudd/nanopoRe: Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

Defines functions chromosomeMappingSummary getSeqLengths getChromosomeIds getReferenceGenomeGR getChromosomeSequence getStringSetId cleanReferenceGenome loadReferenceGenome getReferenceGenome setReferenceGenome

Documented in chromosomeMappingSummary cleanReferenceGenome getChromosomeIds getChromosomeSequence getReferenceGenome getReferenceGenomeGR getSeqLengths getStringSetId loadReferenceGenome setReferenceGenome

R Package Documentation

Browse R Packages

We want your feedback!

sagrudd/nanopoRe Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

R/ReferenceGenome.R In sagrudd/nanopoRe: Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

Defines functions chromosomeMappingSummary getSeqLengths getChromosomeIds getReferenceGenomeGR getChromosomeSequence getStringSetId cleanReferenceGenome loadReferenceGenome getReferenceGenome setReferenceGenome

Documented in chromosomeMappingSummary cleanReferenceGenome getChromosomeIds getChromosomeSequence getReferenceGenome getReferenceGenomeGR getSeqLengths getStringSetId loadReferenceGenome setReferenceGenome

R Package Documentation

Browse R Packages

We want your feedback!

sagrudd/nanopoRe
Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data

R/ReferenceGenome.R
In sagrudd/nanopoRe: Accessory Methods for the Analysis of Oxford Nanopore Technologies DNA Sequence Data