R/fast5.R

Defines functions extract5mCProbabilities fast5parseLegend extractModifiedBasesFromFast5 extractMethylatedBases loadMethylationBamFile extract5mC extractH5Modstemplate

Documented in extract5mC extract5mCProbabilities extractMethylatedBases loadMethylationBamFile

#' @importFrom rhdf5 h5readAttributes
#' @importFrom xts .parseISO8601
#' @importFrom dplyr last
extractH5Modstemplate <- function(read_id, h5content, fast5file) {
    sample <- h5content[grep(read_id, h5content$group),]
    # get the BaseModProbs ...
    modBaseProbs <- which(sample$name=="ModBaseProbs")
    if (length(modBaseProbs)==0) {
        stop(paste0("No Base Modification table in Fast5 file element [",sample,"]"))
    } else if (length(modBaseProbs)==1) {
        return(basename(dirname(sample[modBaseProbs,"group"])))
    } else {
        # there are multiple methylation calls in this dataset - choose the most recent
        # https://en.wikipedia.org/wiki/ISO_8601#Combined_date_and_time_representations
        times <- lapply(modBaseProbs, function(x) {
            xts::.parseISO8601(
                gsub("Z$", "", rhdf5::h5readAttributes(fast5file, dirname(sample[x,"group"]))$time_stamp), tz="UTC")$first.time})
        pos <- modBaseProbs[dplyr::last(order(unlist(times)))]
        return(basename(dirname(sample[pos,"group"])))
    }
}




#' prepares a data.frame of per-read methylC bases from provided fast5 file
#'
#' This method will parse a fast5 file to extract the 5mC bases classified as 5mC with a
#' probability >= the provided threshold_5mc variable
#'
#' @param fast5file is the fast5 file to parse
#' @param threshold_5mc is the cutoff to apply to values returned
#' @param fast5files is a vector of files being processed (for pretty logging)
#' @param force to force a recalculation
#' @param ... beyond
#'
#' @return data.frame of coordinates and sequence names from fast5
#'
#' @export
extract5mC <- function(fast5file, threshold_5mc = 0.85, fast5files=NULL, force=FALSE, ...) {
    fast5parseLegend(fast5file, fast5files)
    baseProbResults <- file.path(getRpath(), paste0(digest::digest(fast5file, algo="md5", file = FALSE), ".modbaseprobs.", threshold_5mc, ".Rdata"))
    if (file.exists(baseProbResults) && !force) {
        return(invisible(readRDS(file = baseProbResults)))
    }

    mod_data <- extractModifiedBasesFromFast5(fast5file, force=force, ...)
    mod_data <- mod_data[which(mod_data$prob_5mC>=threshold_5mc),]

    saveRDS(mod_data, file = baseProbResults, compress=FALSE)

    return(invisible(mod_data))
}





#' prepares a data.frame of read mapping context for methylation parsing
#'
#' This method will parses the specified BAM file to extract CIGAR and other coordinates for
#' the purpose of selecting sequence bases that are base modified
#'
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools BamFile
#' @importFrom Rsamtools scanBam
#' @importFrom Rsamtools bamFlagAsBitMatrix
#' @param bamFile is the bam file to parse
#' @param chrId is the identifier for the chromosome to parse
#' @param force is logical specifying whether the load should be forced (recalculated)
#'
#' @return data.frame
#'
#' @export
loadMethylationBamFile <- function(bamFile, chrId, force=FALSE) {
    print(paste0("parsing bam [",bamFile,"@",chrId,"]"))

    bamParseResults <- file.path(getRpath(), paste0(digest::digest(bamFile, algo="md5", file = FALSE), ".chr.", chrId, ".Rdata"))
    if (file.exists(bamParseResults) && !force) {
        bamChunk <- readRDS(file = bamParseResults)
        return(invisible(bamChunk))
    }

    what = c("qname", "flag", "rname", "strand", "pos", "qwidth", "cigar")
    param = Rsamtools::ScanBamParam(which = GenomicRanges::GRanges(seqnames = chrId, ranges = IRanges::IRanges(start = 1, end = as.numeric(getSeqLengths(chrId)))),
                                    what = what)

    bam <- open(Rsamtools::BamFile(bamFile))
    reads = Rsamtools::scanBam(bam, param = param)[[1L]]

    close(bam)

    # dispose of the secondary, supplementary and unmapped reads
    bamChunk <- as.data.frame(reads)
    flagMatrix <- as.data.frame(Rsamtools::bamFlagAsBitMatrix(as.integer(bamChunk$flag)))

    secondary <- which(flagMatrix$isSecondaryAlignment==1)
    supplemen <- which(flagMatrix$isSupplementaryAlignment==1)
    unmapped <- which(flagMatrix$isUnmappedQuery==1)

    drop <- unique(append(append(secondary, supplemen), unmapped))
    bamChunk <- bamChunk[-drop, ]

    saveRDS(bamChunk, file = bamParseResults, compress=FALSE)

    return(invisible(bamChunk))
}




#' prepares a data.frame of read mapping context for methylation parsing
#'
#' This method will parses the specified BAM file to extract CIGAR and other coordinates for
#' the purpose of selecting sequence bases that are base modified
#'
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools BamFile
#' @importFrom Rsamtools scanBam
#' @importFrom Rsamtools bamFlagAsBitMatrix
#' @param x is a numeric pointer
#' @param filteredChunk corresponds to parsed BAM file data.frame
#' @param modifications_df corresponds to parsed methylation data.frame
#'
#' @return data.frame
#'
#' @export
extractMethylatedBases <- function(x, filteredChunk, modifications_df) {
    mapData <- filteredChunk[x,]
    variants <- which(modifications_df$read_id == mapData$qname)
    varData <- modifications_df[variants,]

    if (mapData$strand == '+') {
        referenceCoordinates <- cigarQ2R(varData$position, mapData$cigar)  + mapData$pos
        op <- cigarQ2Op(varData$position, mapData$cigar)
    } else {
        referenceCoordinates <- cigarQ2R((mapData$qwidth - varData$position - 1), mapData$cigar)  + mapData$pos
        op <- cigarQ2Op((mapData$qwidth - varData$position - 1), mapData$cigar)
    }

    # there is a sneaky exception case where NA values creep in ...
    if (NA %in% referenceCoordinates) {
        op <- op[-which(is.na(referenceCoordinates))]
        varData <- varData[-which(is.na(referenceCoordinates)),]
        referenceCoordinates <- referenceCoordinates[-which(is.na(referenceCoordinates))]
    }

    pdata <- NA

    if (length(referenceCoordinates)>0) {
        bases <- getChromosomeSequence(getStringSetId(mapData$rname))[referenceCoordinates]
        if (mapData$strand == '+') {
            bases <- Biostrings::complement(bases)
        }
        rnt <- strsplit(as.character(bases), "")[[1]]
        pdata <- data.frame(chr=as.character(mapData$rname),
                            pos=referenceCoordinates,
                            fwd=as.integer(mapData$strand == '+'),
                            rev=as.integer(mapData$strand == '-'),
                            op=as.character(op),
                            A=as.integer(rnt=='A'),
                            C=as.integer(rnt=='C'),
                            G=as.integer(rnt=='G'),
                            T=as.integer(rnt=='T'),
                            prob=varData$prob_5mC,
                            seq_context=as.character(varData$seq_context),
                            stringsAsFactors = FALSE)
        pdata <- pdata[which(pdata$op == "M"), ]
    }

    return(invisible(pdata))
}




extractModifiedBasesFromFast5 <- function(fast5file, mc.cores=(parallel::detectCores()-1), motif="CG", force=FALSE, parallel=TRUE) {

    baseModResults <- file.path(getRpath(), paste0(digest::digest(fast5file, algo="md5", file = FALSE), ".basemods", ".Rdata"))
    if (file.exists(baseModResults) && !force) {
        return(invisible(readRDS(file = baseModResults)))
    }
    h5content <- rhdf5::h5ls(fast5file)
    read_ids <- h5content[which(h5content$group == "/"),"name"]

    # use the first read_id to parse out dates of
    read_id <- read_ids[1]
    methtemplate <- extractH5Modstemplate(read_id, h5content=h5content, fast5file=fast5file)

    extract5mCByRead <- function(read_id, fast5file, template="Basecall_1D_000") {
        targetFastq <- paste0("/",read_id,"/Analyses/",template,"/BaseCalled_template/Fastq")
        targetMods <- paste0("/",read_id,"/Analyses/",template,"/BaseCalled_template/ModBaseProbs")

        read_id <- gsub("(^/read_)|(/Analyses.+)", "", targetFastq)

        fastq <- unlist(strsplit(unlist(strsplit(rhdf5::h5read(fast5file, targetFastq), "\n"))[2], ""))

        basemodprob <- data.frame(t(rhdf5::h5read(fast5file, targetMods)))[,4]/255

        mods_df <- data.frame(read_id=read_id, position=seq(length(fastq)), nucleotide=fastq, prob_5mC=basemodprob, stringsAsFactors=FALSE)
        mods_df$CpG <- fastq[mods_df$pos]=="C" & fastq[mods_df$pos+1]=="G"
        mods_df <- mods_df[which(mods_df$CpG),]

        mods_df$seq_context = unlist(lapply(mods_df$pos, function(x) { paste0(fastq[seq(from=max(x-5,1), to=min(x+5, length(fastq)))], collapse="") }))
        return(invisible(mods_df))
    }

    if (parallel==TRUE) {
        mod_data <- dplyr::bind_rows(pbmcapply::pbmclapply(read_ids, extract5mCByRead, fast5file=fast5file, template=methtemplate, mc.cores=mc.cores))
    } else {
        mod_data <- dplyr::bind_rows(lapply(read_ids, extract5mCByRead, fast5file=fast5file, template=methtemplate))
    }
    saveRDS(mod_data, file = baseModResults, compress=FALSE)
    return(mod_data)
}



fast5parseLegend <- function(fast5file, fast5files) {
    if (is.null(fast5files)) {
        cat(paste0("extracting 5mC probabilities from [",fast5file,"]\n"))
    } else {
        cat(paste0("[",which(fast5files==fast5file)," of ",length(fast5files),"] 5mC extract [",fast5file,"]\n"))
    }
}



#' prepares a data.frame of per-read methylC probability counts
#'
#' This method will parse a fast5 file to the distribution of probabilities for C residues
#' actually being 5mC residues
#'
#' @param fast5file is the fast5 file to parse
#' @param fast5files is a vector of files being processed (for pretty logging)
#' @param force to force recalculation
#' @param b the number of sample breaks to make
#' @param baseProbResults target file
#' @param ... beyond
#'
#' @return data.frame of coordinates and sequence names from fast5
#'
#' @export
extract5mCProbabilities <- function(fast5file, fast5files=NULL, force=FALSE, b=33, baseProbResults=NA, ...) {
    fast5parseLegend(fast5file, fast5files)
    if (is.na(baseProbResults)) {
        baseProbResults <- file.path(getRpath(), paste0(digest::digest(paste0(fast5file,b), algo="md5", file = FALSE), ".modprobs.", b, ".Rdata"))
    }
    if (file.exists(baseProbResults) && !force) {
        return(invisible(readRDS(file = baseProbResults)))
    }

    numericSeq <- seq(0-1/b,1,by=1/b)

    mod_data <- extractModifiedBasesFromFast5(fast5file, force=force, ...)
    tdata <- transform(mod_data, group=cut(prob_5mC, breaks=numericSeq))
    rdata <- do.call(data.frame, aggregate(prob_5mC~group, tdata, FUN=function(x) c(Count=length(x), Sum=sum(x))))

    saveRDS(rdata, file = baseProbResults, compress=FALSE)
    return(invisible(rdata))
}
sagrudd/nanopoRe documentation built on June 7, 2020, 10:20 p.m.