R/fast5Readers.R

Defines functions readFast5Log .getAligned .getRawSignal .getBaseCalled .findAnalysisNumber .getEvents .getReadChannelMux2 .getReadChannelMux .getReadNumber .getSamplingRate .getRawStartDuration .getStartTime .getRunID

Documented in readFast5Log

.getRunID <- function(file) {
    
    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    exists <- .groupExistsObj(fid, group = "/UniqueGlobalKey/tracking_id")
    if(!exists) {
        run_id <- NA
    } else {
        gid <- H5Gopen(fid, "/UniqueGlobalKey/tracking_id")   
        aid <- H5Aopen(gid, "run_id")
        run_id <- H5Aread(aid)
        H5Aclose(aid)
        H5Gclose(gid)
    }
    return( run_id )
    
}

.getStartTime <- function(file, samplingRate = NULL, readNum = NULL) {
    
    if(is.character(file)) {
        fid <- H5Fopen(file)
        on.exit(H5Fclose(fid))
    } else {
        fid <- file
    }
    
    start_time <- NA
    
    ## providing the read number might speed things up if we have it already
    if(is.null(readNum)) {
        gid <- H5Gopen(fid, "/Analyses/EventDetection_000/Reads")
        readNum <- h5ls(gid)[1,"name"]
        H5Gclose(gid)
    }
    
    if(.groupExistsObj(fid, group = paste0("/Raw/EventDetection_000/Reads/", readNum))) {
        gid <- H5Gopen(fid, paste0("/Analyses/EventDetection_000/Reads/", readNum)) 
        aid <- H5Aopen(gid, "start_time") 
        start_time <- H5Aread(aid) 
        H5Aclose(aid)
    }
    
    ## start time should be stored under /Raw/Reads, so try here too
    ## for old files this doesn't exist, and sometimes start_time is blank
    if(is.na(start_time) && 
       .groupExistsObj(fid, group = paste0("/Raw/Reads/", readNum))) {
        gid <- H5Gopen(fid, paste0("/Raw/Reads/", readNum)) 
        aid <- H5Aopen(gid, "start_time") 
        start_time <- H5Aread(aid) 
        H5Aclose(aid)
    }
    
    ## start_time is recorded in number of samples. If we have to sampling rate
    ## we can convert this to seconds
    if(!is.null(samplingRate)) {
        start_time <- start_time / samplingRate
    }
    
    return(start_time)
} 

## We can often get temporal information from either event data
## or raw data.  This function gets the start time and durtion
## from the raw data group in the fast5 file
.getRawStartDuration <- function(file, readNum = NULL) {
    
    if(is.character(file)) {
        fid <- H5Fopen(file)
        on.exit(H5Fclose(fid))
    } else {
        fid <- file
    }
    
    start_time <- NA
    
    ## providing the read number might speed things up if we have it already
    ## wont work for now, later code expects just the number not 'Read_N'
    #if(is.null(readNum)) {
    #    gid <- H5Gopen(fid, "/Raw/Reads/")
    #    readNum <- h5ls(gid)[1,"name"]
    #    H5Gclose(gid)
    #}
    
    ## start time should be stored under /Raw/Reads, so try here too
    ## for old files this doesn't exist, and sometimes start_time is blank
    if(is.na(start_time) && 
       .groupExistsObj(fid, group = paste0("/Raw/Reads/Read_", readNum))) {
        gid <- H5Gopen(fid, paste0("/Raw/Reads/Read_", readNum)) 
        aid <- H5Aopen(gid, "start_time") 
        start_time <- H5Aread(aid) 
        H5Aclose(aid)
        
        aid <- H5Aopen(gid, "duration") 
        duration <- H5Aread(aid) 
        H5Aclose(aid)
        H5Gclose(gid)
    }
    
    return( c(start_time, duration) )
} 

## The sampling is how many times the signal is recorded per second.
## We use this to convert the 'duration' and 'start_time' in the event data
## into seconds.  It may also be useful meta data
#' @importFrom stats median
.getSamplingRate <- function(file) {
    
    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    exists <- .groupExistsObj(fid, group = "/UniqueGlobalKey/channel_id")
    if(!exists) {
        rate <- NA
    } else {
        gid <- H5Gopen(fid, "/UniqueGlobalKey/channel_id/")   
        aid <- H5Aopen(gid, "sampling_rate")
        rate <- H5Aread(aid)
        H5Aclose(aid)
        H5Gclose(gid)
    }
    return( rate )
}


.getReadNumber <- function(fid) {
    gid <- H5Gopen(fid, "/Analyses/EventDetection_000/Reads")
    on.exit(H5Gclose(gid))
    read_number_char <- h5ls(gid)[1,"name"]
    read_number_char <- stringr::str_replace(string = read_number_char, pattern = "Read_", replacement = "")
    read_number <- as.integer(read_number_char)
    return(read_number)
}

.getReadChannelMux <- function(file) {
    
    if(is.character(file)) {
        fid <- H5Fopen(file)
        on.exit(H5Fclose(fid))
    } else {
        fid <- file
    }
    
    # here we get the starting mux and the read number from the channel
    exists <- .groupExistsObj(fid, group = "/Analyses/EventDetection_000/Reads")
    if(!exists) {
        start_mux <- NA
        read_number <- NA
    } else {
        ## get the Read_No., this changes in every file
        gid <- H5Gopen(fid, "/Analyses/EventDetection_000/Reads")
        read_number_char <- h5ls(gid)[1,"name"]
        H5Gclose(gid)
        
        ## Open the group and read the two attribute we want
        gid <- H5Gopen(fid, paste0("/Analyses/EventDetection_000/Reads/", read_number_char))   
        aid <- H5Aopen(gid, "start_mux")
        start_mux <- H5Aread(aid)
        H5Aclose(aid)
        aid <- H5Aopen(gid, "read_number")
        read_number <- H5Aread(aid)
        H5Aclose(aid)   
        H5Gclose(gid)
    }
    
    # we're also interested in the channel information
    exists <- .groupExistsObj(fid, group = "/UniqueGlobalKey/channel_id")
    if(!exists) {
        channel_number <- NA
    } else {
        gid <- H5Gopen(fid, "/UniqueGlobalKey/channel_id/")   
        aid <- H5Aopen(gid, "channel_number")
        channel_number <- H5Aread(aid)
        H5Aclose(aid)
        H5Gclose(gid)
    }
    
    return( tibble(read = as.integer(read_number), 
                   channel = as.integer(channel_number), 
                   mux = as.integer(start_mux)) ) 
}

.getReadChannelMux2 <- function(file, readNumber = NA, dontCheck = FALSE) {
    
    if(is.character(file)) {
        fid <- H5Fopen(file)
        on.exit(H5Fclose(fid))
    } else {
        fid <- file
    }
    
    # here we get the starting mux and the read number from the channel
    #exists <- .groupExistsObj(fid, group = "/Analyses/EventDetection_000/Reads")
    if(dontCheck || .groupExistsObj(fid, group = "/Analyses/EventDetection_000/Reads")) {
        ## get the Read_No., this changes in every file
        if(is.na(readNumber)) {
            readNumber <- .getReadNumber(fid)
        }
        
        ## Open the group and read the two attribute we want
        gid <- H5Gopen(fid, paste0("/Analyses/EventDetection_000/Reads/Read_", readNumber))   
        aid <- H5Aopen(gid, "start_mux")
        start_mux <- H5Aread(aid)
        H5Aclose(aid)
        H5Gclose(gid)
    } else {
        start_mux <- NA
    }
    
    # we're also interested in the channel information
    if(dontCheck || .groupExistsObj(fid, group = "/UniqueGlobalKey/channel_id")) {
        gid <- H5Gopen(fid, "/UniqueGlobalKey/channel_id/")   
        aid <- H5Aopen(gid, "channel_number")
        channel_number <- H5Aread(aid)
        H5Aclose(aid)
        H5Gclose(gid)
    } else {
        channel_number <- NA
    }
    
    return( tibble(read = as.integer(readNumber), 
                   channel = as.integer(channel_number), 
                   mux = as.integer(start_mux)) ) 
}


.getEvents <- function(file) {
    
    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    exists <- .groupExistsObj(fid, group = "/Analyses/EventDetection_000/Reads")
    if(!exists) {
        start_time <- duration <- num_events <- median_signal <- NA
    } else {
        ## get the Read_No., this changes in every file
        gid <- H5Gopen(fid, "/Analyses/EventDetection_000/Reads")
        read_number_char <- h5ls(gid)[1,"name"]
        H5Gclose(gid)
        
        ## Open the group
        gid <- H5Gopen(fid, paste0("/Analyses/EventDetection_000/Reads/", read_number_char)) 
        did <- H5Dopen(gid, "Events")
        events <- as_tibble(H5Dread(did, bit64conversion = "int", compoundAsDataFrame = TRUE))
        H5Dclose(did)
        
        H5Gclose(gid)
    } 
    
    return(events)
}

## we want to determine if there are more than one Basecalled slots in the 
## fast5 file e.g. Basecalled_1D_000 and Basecalled_1D_001 etc.  We will use
## the highest number to read from
#' @importFrom stringr str_match
.findAnalysisNumber <- function(fid, grepString = "Basecall_[12]D_([0-9]+)") {
    
    groups <- unique(h5ls(fid)[,'group'])
    options <- unique(str_match(string = groups, pattern = grepString)[,2])
    return( options[ which.max(as.integer(options)) ] )
}



.getBaseCalled <- function(file, strand = "template") {
    
    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    analysisNum <- .findAnalysisNumber(fid)
    
    for(d in c("1D", "2D")) {
        exists <- .groupExistsObj(fid, group = paste0("/Analyses/Basecall_", d,
                                                      "_", analysisNum, 
                                                      "/Summary/basecall_1d_", strand))
        if(exists) break;
    }
    
    if(!exists) {
        events <- NA
    } else {
        ## Open the group and read
        did <- H5Dopen(fid, paste0("/Analyses/Basecall_", d, "_", analysisNum, "/BaseCalled_", strand, "/Events"))   
        events <- as_tibble(H5Dread(did, bit64conversion = "int", compoundAsDataFrame = TRUE))
        H5Dclose(did)
    }

    return(events)  
}

.getRawSignal <- function(file) {

    if(is.character(file)) {
        fid <- H5Fopen(file)
        on.exit(H5Fclose(fid))
    } else {
        fid <- file
    }

    exists <- .groupExistsObj(fid, group = "/Raw/Reads")
    if(!exists) {
        #warning("Raw signal data not found")
        signal <- NA
    } else {
        ## get the Read_No., this changes in every file
        gid <- H5Gopen(fid, "/Raw/Reads")
        read_number_char <- h5ls(gid)[1,"name"]
        H5Gclose(gid)

        ## Open the group
        gid <- H5Gopen(fid, paste0("/Raw/Reads/", read_number_char))
        
        did <- H5Dopen(gid, "Signal")
        signal = H5Dread(did)
        H5Dclose(did)

        ## get the starting time
        aid <- H5Aopen(gid, name = "start_time")
        startTime <- H5Aread(aid)
        H5Aclose(aid)
        
        H5Gclose(gid)

        raw <- tibble(signal, time = seq(startTime, length.out = length(signal)))
    }

    return(raw)
}

.getAligned <- function(file) {
 
    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    exists <- .groupExistsObj(fid, group = "/Analyses/Alignment_000/Aligned_2d/SAM")
    
    if(exists) {
        did <- H5Dopen(fid, "/Analyses/Alignment_000/Aligned_2d/SAM")
        samData <- tibble(H5Dread(did))
    } else {
        samData <- ""
    }
    
    return(samData)
}



#' Read the log entry from a single fast5 file
#' 
#' Basecalling procedures performed on fast5 files generally leave a 
#' log file entry recording how far through the pipeline the file 
#' proceeded.  This function will extract this information as a 
#' single string.  It can be printed in a more readable format 
#' using the \code{\link{cat}} function.
#' 
#' @param file Character vector of fast5 file to be read.
#' @return Character vector containing the log information.  
#' \code{NULL} if no log is found.
#' @examples 
#' fast5file <- system.file('extdata', 'example.fast5', package = "IONiseR")
#' log <- readFast5Log(fast5file)
#' cat(log)
#' @export
readFast5Log <- function(file) {

    fid <- H5Fopen(file)
    on.exit(H5Fclose(fid))
    
    log <- NULL
    
    for(d in c("1D", "2D")) {
        path <- paste0("/Analyses/Basecall_", d, "_000/Log")
        exists <- .groupExistsObj(fid, group = path)
        if(exists) {
            did <- H5Dopen(fid, path)
            log <- c(log, H5Dread(did))
        }
    }
    
    if(is.null(log))
        message("No log information found.")
    
    return(log)
}
grimbough/IONiseR documentation built on Sept. 24, 2020, 11:32 a.m.