#' Finds if a DNA read is poly(A) read or poly(T) read
#'
#' This function reads the data from a fast5 file, and then alings primers to
#' the read to discover if it is a poly(A) or poly(T) read. For poly(A) reads,
#' the function further tests if the read is a complete read -- and not truncated
#' prematurely. The function also find the rough end site of the poly(A) tail,
#' and the rough start site of the poly(T) tail.
#'
#' @param file_path a character string[NA]. Full path of the read whose type is
#' to be determined. Use it if the read is basecalled with Albacore and is of
#' one-read-per-fast5 type.
#' @param dna_datatype a character string ['cdna']. Specify if the read is 'cdna'
#' or pcr-dna'.
#' @param plot_debug a logical [FALSE]. Specifies whether to compute data needed
#' for plotting debug.
#' @param basecalled_with a character string. Specify if the data is from
#''albacore' or 'guppy'
#' @param multifast5 a logical. Set it to TRUE if the file to be processed
#' is multifast5. Set it to FALSE if the file to be processed is a single fast5
#' file
#' @param model a string. Set to 'flipflop' if the basecalling model is flipflop.
#' Set to 'standard' if the basecalling model is standard model.l
#'
#' @param plotting_library a string.
#'
#' @param read_id_fast5_file a list [NA]. A list of 'read_id' and 'fast5_file'
#' path. Use this option when a read from a multifast5 file is to be read. In
#' such a case, you should set file_path to NA, and set multifast5 flag to TRUE.
#' @param ... An other parameter. For future expansion.
#' @examples
#' \dontrun{
#'
#' # 1. If the data is multifast5 cDNA (direct cDNA or amplified cDNA)
#' data basecalled with flip-flop algorithm
#' read_id_fast5_file = list(read_id=read_id, fast5_file=full_path_of_fast5_file)
#' find_dna_tailtype(dna_datatype = 'cdna',
#' multifast5 = TRUE,
#' basecalled_with = 'guppy',
#' model = 'flipflop',
#' read_id_fast5_file = read_id_fast5_file)
#'
#' # 2. If the data is multifast5 pcr-DNA data basecalled with flip-flop
#' algorithm
#' read_id_fast5_file = list(read_id=read_id, fast5_file=full_path_of_fast5_file)
#' find_dna_tailtype(dna_datatype = 'pcr-dna',
#' multifast5=TRUE,
#' basecalled_with = 'guppy',
#' model = 'flipflop',
#' read_id_fast5_file = read_id_fast5_file)
#'
#' # 3. If the data is cDNA (direct cDNA or amplified cDNA) data basecalled with
#' albacore with single fast5 files as output
#' find_dna_tailtype(file_path = full_file_path_of_the_read,
#' dna_datatype = 'cdna',
#' multifast5 = FALSE,
#' basecalled_with = 'albacore',
#' model = 'standard')
#' }
#'
#' @return A list containing all the relevant information
#'
find_dna_tailtype <- function(file_path = NA,
basecall_group = 'Basecall_1D_000',
dna_datatype = 'cdna',
plot_debug = FALSE,
basecalled_with,
multifast5,
model,
read_id_fast5_file = NA,
plotting_library,
...) {
# get the substitution matrix -- if passed from outside
if (length(list(...)) > 0) {
lst <- list(...)
if ("dna_opts" %in% names(lst)) {
dna_opts <- lst$dna_opts
match <- dna_opts$match
mismatch <- dna_opts$mismatch
type <- dna_opts$type
gapOpening <- dna_opts$gapOpening
gapExtension <- dna_opts$gapExtension
submat <- dna_opts$submat
if (dna_datatype == 'custom-cdna') {
fp <- dna_opts$fp
ep <- dna_opts$ep
}
}
} else {
# otherwise make one
match <- 1
mismatch <- -1
type <- 'local'
gapOpening <- 0
gapExtension <- 1
submat <- Biostrings::nucleotideSubstitutionMatrix(match = match,
mismatch = mismatch,
baseOnly = TRUE)
}
# extract read data
read_data <- extract_read_data(file_path,
read_id_fast5_file,
plot_debug,
basecalled_with,
basecall_group,
multifast5,
model,
plotting_library,
experiment_type = 'dna')
# get event data table and the fastQ
#event_data <- read_data$event_data
fastq <- read_data$fastq
# define the adaptor sequences
#fa <- Biostrings::DNAString('GGCGTCTGCTTGGGTGTTTAACCTTTTTTTTTTAATGTACTTCGTTCAGTTACGTATTGCT')
#ea <- Biostrings::DNAString('GCAATACGTAACTGAACGAAGT')
# define the primer sequences
if (dna_datatype == 'cdna') {
# # these are the legacy front and end primers
# fp <- Biostrings::DNAString('TTTCTGTTGGTGCTGATATTGCTGCCATTACGGCCGGG')
# ep <- Biostrings::DNAString('ACTTGCCTGTCGCTCTATCTTC')
# new front and end primers from ONT's PCS110 kit
fp <- Biostrings::DNAString('TTTCTGTTGGTGCTGATATTGCTT')
ep <- Biostrings::DNAString('CTTGCCTGTCGCTCTATCTTCAGAGGAGAGTCCGCCGCCCGCAAG')
threshold <- 0.6
} else if (dna_datatype == 'pcr-dna') {
fp <- Biostrings::DNAString('ATTTAGGTGACACTATAGCGCTCCATGCAAACCTGTC')
ep <- Biostrings::DNAString('GAGTCCGGGCGGCGC')
threshold <- 0.68
} else if (dna_datatype == 'custom-cdna') {
fp <- Biostrings::DNAString(fp)
ep <- Biostrings::DNAString(ep)
threshold <- 0.6
}
#rc_fp <- Biostrings::reverseComplement(fp)
rc_ep <- Biostrings::reverseComplement(ep)
# RECIPE:
# 1. Tail is poly(A) if nas_fp > nas_ep > 0.6 --> check if its not prematurely terminated read
# by checking for rc_ep at the end of the of the read
# 2. Tail is poly(T) if nas_ep > nas_fp > 0.6
# 3. Invalid otherwise
# define a search window width within which to find the ep and fp
if (dna_datatype == 'cdna') {
search_window <- 140 # The ep is longer with ONT's newer PCS110 kit requiring more search window
} else {
search_window <- 100 # For our legacy data and PCR DNA approach
}
as_fp <- Biostrings::pairwiseAlignment(pattern=fp,
subject=Biostrings::DNAString(substr(fastq, start=1, stop=min(search_window, nchar(fastq)))),
substitutionMatrix=submat,
type=type,
scoreOnly=FALSE,
gapOpening=gapOpening,
gapExtension=gapExtension)
as_ep <- Biostrings::pairwiseAlignment(pattern=ep,
subject=Biostrings::DNAString(substr(fastq, start=1, stop=min(search_window, nchar(fastq)))),
substitutionMatrix=submat,
type=type,
scoreOnly=FALSE,
gapOpening=gapOpening,
gapExtension=gapExtension)
nas_fp <- as_fp@score/fp@length
nas_ep <- as_ep@score/ep@length
#nas_rc_ep <- NA # for max remove later
#nas_rc_fp <- NA # for max reomve later
has_precise_boundary <- FALSE
bases_to_match <- 3
# check the front primer and rev comp end primer score to decide
# between polyA and polyT reads.
if (nas_fp > nas_ep & nas_fp > threshold){
read_type <- 'polyA'
# check if there is the end primer at the end of the polyA read
# adjacent to the polyA tail
# define a search window width within which to find the ep and fp
if (dna_datatype == 'cdna') {
fp_search_window <- 80 # The ep is longer with ONT's newer PCS110 kit requiring more search window
} else {
fp_search_window <- 50 # For our legacy data and PCR DNA approach
}
# #adnan
# as_rc_ep <- Biostrings::pairwiseAlignment(pattern=rc_ep,
# subject=Biostrings::DNAString(substr(fastq, start=max(nchar(fastq)-fp_search_window, 0), stop=nchar(fastq))),
# substitutionMatrix=submat,
# type=type,
# scoreOnly=FALSE,
# gapOpening=gapOpening,
# gapExtension=gapExtension)
# nas_rc_ep <- as_rc_ep@score/rc_ep@length
# tail_is_valid <- ifelse(nas_rc_ep > threshold, TRUE, FALSE)
#
# eaiser logic; convert the end of FASTQ to reverse complement
as_rc_ep <- Biostrings::pairwiseAlignment(pattern=ep,
subject=Biostrings::reverseComplement(
Biostrings::DNAString(
substr(fastq, start=max(nchar(fastq)-fp_search_window-1, 0), stop=nchar(fastq))
)
),
substitutionMatrix=submat,
type=type,
scoreOnly=FALSE,
gapOpening=gapOpening,
gapExtension=gapExtension)
nas_rc_ep <- as_rc_ep@score/rc_ep@length
tail_is_valid <- ifelse(nas_rc_ep > threshold, TRUE, FALSE)
# If it is a valid polyA tail, then find the rough starting site of the tail
if (tail_is_valid) {
#adnan
#polya_end_fastq <- as_rc_ep@subject@range@start + nchar(fastq) - fp_search_window - as_rc_ep@pattern@range@start
# easier logic
polya_end_fastq <- nchar(fastq) - as_rc_ep@subject@range@start - as_rc_ep@subject@range@width + as_rc_ep@pattern@range@start + 1
polya_rough_end <- find_sample_index_for_fastq_base(read_data$event_data, polya_end_fastq, read_type)
# for max remove later
# as_rc_fp <- Biostrings::pairwiseAlignment(pattern=rc_fp,
# subject=Biostrings::DNAString(substr(fastq, start=max(nchar(fastq)-50, 0), stop=nchar(fastq))),
# substitutionMatrix=submat,
# type=type,
# scoreOnly=FALSE,
# gapOpening=gapOpening,
# gapExtension=gapExtension)
# check if we have captured the end of the tail perfectly
# by finding out if we captured the bases adjacent to the tails
if ((substr(as_rc_ep@subject,
start=1,
stop=bases_to_match) == substr(rc_ep,
start=1,
stop=bases_to_match)) &
(as_rc_ep@pattern@range@start == 1)) {
has_precise_boundary <- TRUE
}
# for max remove later
# nas_rc_fp <- as_rc_fp@score/rc_fp@length
} else {
polya_end_fastq <- NA
polya_rough_end <- NA
}
polyt_start_fastq <- NA
polyt_rough_start <- NA
# Check if it is a PolyT tail
} else if (nas_fp < nas_ep & nas_ep > threshold) {
read_type <- 'polyT'
tail_is_valid <- T
polyt_start_fastq <- as_ep@subject@range@start + as_ep@subject@range@width
polyt_rough_start <- find_sample_index_for_fastq_base(read_data$event_data, polyt_start_fastq, read_type)
polya_end_fastq <- NA
polya_rough_end <- NA
# check if we have captured the end of the tail perfectly
# by finding out if we captured the bases adjacent to the tails
if (substr(as_ep@subject,
start=nchar(as.character(as_ep@subject))-bases_to_match+1,
stop=nchar(as.character(as_ep@subject))) == substr(ep,
start=ep@length-bases_to_match+1,
stop=ep@length)) {
has_precise_boundary <- TRUE
}
# if the above two checks fail then it is an invalid read
} else {
read_type <- 'invalid'
tail_is_valid <- F
polyt_start_fastq <- NA
polyt_rough_start <- NA
polya_end_fastq <- NA
polya_rough_end <- NA
has_precise_boundary <- NA
}
# df <- data.frame(read_data$raw_data)
# p <- ggplot2::ggplot(data=df) +
# ggplot2::geom_line(ggplot2::aes(x=c(1:length(read_data$raw_data)), y=read_data.raw_data))
#
# if (!is.na(polyt_rough_start)){
# p <- p + ggplot2::geom_vline(xintercept = polyt_rough_start, color='red')
# }
# if (!is.na(polya_rough_end)){
# p <- p + ggplot2::geom_vline(xintercept = polya_rough_end, color='green')
# }
#
# print(p)
# nchar(fastq)
list(read_data = read_data, # for max comment it out
read_type = read_type,
tail_is_valid = tail_is_valid,
polya_end_fastq = polya_end_fastq,
polyt_start_fastq = polyt_start_fastq,
polya_end = polya_rough_end,
polyt_start = polyt_rough_start,
has_precise_boundary = has_precise_boundary
)
#nas_fp=nas_fp,
#nas_rc_ep=nas_rc_ep,
#nas_ep=nas_ep,
#nas_rc_fp=nas_rc_fp))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.