#' Install BBtools
#'
#' @param url (Optional) Default will search for the latest version
#' URL to retrieve bbmap from.
#' @param dest_dir (Optional) Default "bin"
#' Directory to install bbmap within.
#' @param dest.dir DEPRECATED
#' @force Whether existing installs should be forcefully overwritten
#'
#'
#' @return
#' @export
#'
#' @import httr
#' @import utils
#' @examples
bbmap_install <- function(url, dest_dir="bin", dest.dir = "bin", force = FALSE) {
if (!missing("dest.dir")){
warning("Argument dest.dir is deprecated, use dest_dir instead")
dest_dir <- dest.dir
}
if (missing(url)) {
url <- ("https://sourceforge.net/projects/bbmap/files/latest/download")
}
if (!dir.exists(dest_dir)) {
dir.create(dest_dir) # Create first directory
}
if (dir.exists(paste0(dest_dir, "/bbmap")) && force == FALSE) {
message("Skipped as bbmap already exists in directory, to overwrite set force to TRUE")
return(NULL)
} else if (dir.exists(paste0(dest_dir, "/bbmap")) && force == TRUE) {
unlink(paste0(dest_dir, "/bbmap"), recursive = TRUE) # Remove old version
}
destfile <- paste0(file.path(dest_dir, basename(url)),".tar.gz")
if (file.exists(destfile)) {
file.remove(destfile) # Remove old zip file
}
#Download file
httr::GET(url, httr::write_disk(destfile, overwrite=TRUE))
#Unzip
utils::untar(destfile, exdir = dest_dir)
#Remove download
file.remove(destfile)
}
# Demultiplex by primers --------------------------------------------------
#' Demultiplex fusion primers using BBmap Seal
#'
#' @param install (Required) Install location for bbmap
#' @param fwd (Required) Vector of locations of forward reads
#' @param rev (Optional) Vector of locations of reverse reads
#' @param Fbarcodes (Required) Barcodes used in forward reads
#' @param Rbarcodes (Optional) Barcodes used in reverse reads
#' @param restrictleft (Optional) Defaults to the size of the largest primer.
#' Restricts the kmer search for primer sequences to just the left side of the molecule.
#' @param out.dir (Optional) Default "demux"
#' The path to write the output reads.
#' @param kmer (Optional) default the size of the smallest primer will be used.
#' The kmer size to use for primer searching.
#' @param hdist (Optional) Default = 0. The hamming distance (number of substitution errors) allowed for mismatch to the query primer.
#' @param degenerate (Optional) Default TRUE.
#' Option to search for all possible primer combinations for degenerate primers
#' @param force (Optional) Default TRUE
#' Option to overwrite existing output files.
#' @param interleaved (Optional) Default FALSE
#' Option to input interleaved reads
#' @param threads (Optional) Default autodetect
#' Number of CPU threads to use
#' WARNING: Thread detection can fail on cluster computing currently
#' @param mem (Optional) Default autodetect
#' GB of memory to use
#' WARNING: mem detection can fail on cluster computing currently
#'
#' @return
#' @export
#'
#' @import dplyr
#' @import tibble
#' @import stringr
#'
#' @examples
#' #' \dontrun{
#' path <- "run_test/"
#' demuxpath <- file.path(path, "demux") # Filtered forward files go into the path/filtered/ subdirectory
#'
#' fastqFs <- sort(list.files(path, pattern="R1_001.*", full.names = TRUE))
#' fastqRs <- sort(list.files(path, pattern="R2_001.*", full.names = TRUE))
#'
#' bbdemux(install="bin/bbmap", fwd=fastqFs, rev=fastqRs,Fbarcodes = c("GAGGDACW","TGTGGDAC","AGAAGGDAC"),
#' Rbarcodes = c("ACGTRATW","TCCGTRAT","CTGCGTRA"),
#' degenerate=TRUE, out.dir=demuxpath, threads=1, mem=4,
#' hdist=0, force=TRUE)
#' }
#'
bbdemux <- function(install = NULL, fwd, rev = NULL, Fbarcodes = NULL, Rbarcodes = NULL,
restrictleft = NULL, out.dir = "demux", kmer = NULL, hdist = 0, degenerate = TRUE,
force = TRUE, threads = NULL, mem = NULL, interleaved = FALSE) {
nsamples <- length(fwd)
# Create temp files
tmp <- tempdir()
tmplogs <- paste0(tmp, "/bbdemux.log")
tmpout <- paste0(tmp,"/stdout.log")
tmperr <- paste0(tmp,"/stderr.log")
bbtools_seal <- function(install = NULL, fwd, rev = NULL, Fbarcodes = NULL, Rbarcodes = NULL,
restrictleft = NULL, out.dir = "demux", kmer = NULL, hdist = 0, degenerate = TRUE,
force = TRUE, mem = NULL, threads = NULL) {
in1 <- paste0("in=", fwd)
if (!is.null(rev)) {
in2 <- paste0("in2=", rev)
} else {
(in2 <- "")
}
if (!is.null(Fbarcodes) & is.null(Rbarcodes)) {
writeLines(paste0(">Rep", seq(1:length(Fbarcodes)), "\n", Fbarcodes, "\n"), con = "Fprimers.fa")
ref <- "ref=Fprimers.fa"
} else if (!is.null(Fbarcodes) & !is.null(Rbarcodes)) {
writeLines(paste0(">Rep", seq(1:length(Fbarcodes)), "\n", Fbarcodes, "\n"), con = "Fprimers.fa", sep = "")
writeLines(paste0(">Rep", seq(1:length(Rbarcodes)), "\n", Rbarcodes, "\n"), con = "Rprimers.fa", sep = "")
ref <- "ref=Fprimers.fa,Rprimers.fa"
}
pattern <- paste0("pattern=", out.dir, "/", basename(fwd) %>%
stringr::str_split_fixed("\\.", n = 2) %>%
tibble::as_tibble() %>%
dplyr::pull(V1) %>%
stringr::str_replace(pattern = "_R1_", replacement = "_R1R2_"), "_%.fastq.gz")
if (is.numeric(kmer)) {
kmer <- paste0("k=", kmer)
} else {
kmer <- paste0("k=", sort(nchar(c(Fbarcodes, Rbarcodes)), decreasing = FALSE)[1])
}
if (is.numeric(restrictleft)) {
restrictleft <- paste0("restrictleft=", restrictleft)
} else {
restrictleft <- paste0("restrictleft=", sort(nchar(c(Fbarcodes, Rbarcodes)), decreasing = TRUE)[1])
}
if (is.numeric(hdist)) {
hdist <- paste0("hdist=", hdist)
}
if (degenerate == TRUE) {
degenerate <- "copyundefined"
} else {
(degenerate <- "")
}
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (!is.null(threads)) {
threads <- paste0("threads=", threads)
} else {
(threads <- "threads=auto")
}
if (!is.null(mem)) {
mem <- paste0("-Xmx", mem, "g")
} else {
(mem <- "")
}
args <- paste(" -cp ", paste0(install, "/current jgi.Seal"), mem, in1, in2, ref,
restrictleft, pattern, kmer, hdist,
degenerate, force, threads, "kpt=t",
collapse = " "
)
result <- system2(command="java",
args = args,
stdout= tmpout,
stderr= tmperr,
wait=TRUE)
now <- date()
cat(paste0("Executed: ", now, "\n"), file=tmplogs, append=TRUE)
cat(paste0("Sample:\t", fwd, "\n"), file=tmplogs, append=TRUE)
file.append(tmplogs, tmperr)
file.remove(c(tmpout, tmperr))
}
if (nsamples > 1) {
for (i in 1:nsamples) {
bbtools_seal(
install = install, fwd = fwd[i], rev = rev[i], Fbarcodes = Fbarcodes, Rbarcodes = Rbarcodes,
restrictleft = restrictleft, out.dir = out.dir, kmer = kmer, hdist = hdist, degenerate = degenerate,
force = force, threads = threads, mem = mem
)
}
} else if (nsamples == 1) {
bbtools_seal(install, fwd, rev,
Fbarcodes = Fbarcodes, Rbarcodes = Rbarcodes, restrictleft = restrictleft,
out.dir = out.dir, kmer = kmer, hdist = hdist, degenerate = degenerate,
force = force, threads = threads, mem = mem
)
}
#clean up
file.remove("Fprimers.fa")
file.remove("Rprimers.fa")
#parse and return logs
parsed <- parse_bbdemux(tmplogs)
return(parsed)
}
# Trim primers ------------------------------------------------------------
#' Trim primers using BBDuk
#'
#' @param install (Required) Install location for bbmap
#' @param fwd (Required) Vector of locations of forward reads
#' @param rev (Optional) Vector of locations of reverse reads
#' @param primers (Required) Forward and reverse primers to trim
#' @param restrictleft (Optional) Defaults to the size of the largest primer.
#' Restricts the kmer search for primer sequences to just the left side of the molecule.
#' @param checkpairs (Optional) Whether paired end checking should be conducted
#' @param out.dir (Optional) Default "trimmed"
#' The path to write the output reads.
#' @param trim.end (Optional) Default is "left"
#' End of the molecule to trim primers from. "left" will trim primers from the 3' end of both forward and reverse reads.
#' Change to "right" only if the amplicon was too short and the sequencer has read into the other end of the molecule.
#' @param ordered (Optional) Default TRUE
#' Set to TRUE to output reads in same order as input.
#' @param kmer (Optional) default the size of the smallest primer will be used.
#' The kmer size to use for primer searching.
#' @param mink (optional) Default FALSE
#' Look for shorter kmers at read tips down to this length
#' @param hdist (Optional) Default 0. The hamming distance (number of substitution errors) allowed for mismatch to the query primer.
#' @param tpe (Otional) Default TRUE
#' Trim pairs evenly. When kmer right-trimming, trim both reads to the minimum length of either.
#' @param degenerate (Optional) Default TRUE.
#' Option to search for all possible primer combinations for degenerate primers
#' @param force (Optional) Default TRUE
#' Option to overwrite existing output files.
#' @param quality (Optional) Default FALSE
#' Output quality statistics from trimming including:
#' Base composition histogram by position, Quality histogram by position,
#' Count of bases with each quality value, Histogram of average read quality,
#' Read length histogram and Read GC content histogram.
#' @param maxlength (Optional) Default FALSE
#' Remove all reads above a maximum length. Useful for removing reads where no primers were found.
#'
#' @return
#' @export
#'
#' @import dplyr
#' @import tidyr
#' @import readr
#' @import purrr
#' @import stringr
#' @import processx
#'
#'
#' @examples
#' \dontrun{
#' path <- "run_test/"
#'
#' fastqFs <- sort(list.files(path, pattern="R1_001.*", full.names = TRUE))
#' fastqRs <- sort(list.files(path, pattern="R2_001.*", full.names = TRUE))
#'
#'bbtrim(install="bin/bbmap", fwd=fastqFs, rev=fastqRs,
#' primers=c("GGDACWGGWTGAACWGTWTAYCCHCC","GTRATWGCHCCDGCTARWACWGG"),
#' degenerate=TRUE, out.dir="trimmed", ktrim="left", ordered=TRUE,
#' mink=FALSE, hdist=2, maxlength=140, force=TRUE)
#'}
bbtrim <- function(install = NULL, fwd, rev = NULL, primers, checkpairs=FALSE,
restrictleft = NULL, out.dir = "bbduk", trim.end = "left", ordered = TRUE,
kmer = NULL, mink = FALSE, tbo= TRUE, tpe = TRUE, hdist = 0, degenerate = TRUE,
force = TRUE, quality = FALSE, maxlength = NULL, tmp=NULL, quiet=FALSE) {
nsamples <- length(fwd)
# Create temp files
if(is.null(tmp)) {tmp <- tempdir()}
tmplogs <- paste0(tmp, "/bbtrim.log")
if(file.exists(tmplogs)) { file.remove(tmplogs)}
bbduk <- function(install = NULL, fwd, rev = NULL, primers, checkpairs = FALSE,
restrictleft = NULL, out.dir = "bbduk", trim.end = "left", ordered = TRUE,
kmer = NULL, mink = FALSE, tbo= TRUE, tpe = TRUE, hdist = 0, degenerate = TRUE,
force = TRUE, quality = FALSE, maxlength = NULL, tmp=NULL, quiet=FALSE) {
#install <- paste0(install, "/current jgi.BBDuk")
if(!quiet) {message(paste0("Trimming primers from: ", fwd, " and ", rev))}
# Create temp files
if(is.null(tmp)) {tmp <- tempdir()}
tmplogs <- paste0(tmp, "/bbtrim.log")
tmpout <- paste0(tmp,"/stdout.log")
tmperr <- paste0(tmp,"/stderr.log")
in1 <- paste0("in=", fwd)
if (!is.null(rev)) {
in2 <- paste0("in2=", rev)
} else {
(in2 <- "")
}
if (!is.null(primers)) {
literal <- paste0("literal=", paste0(primers, collapse = ","))
} else {
(stop("Primer sequences are required for trimming"))
}
if (is.null(rev)) {
out <- paste0(
"out=", dirname(fwd), "/", out.dir, "/", basename(fwd)
) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
out1 <- ""
out2 <- ""
} else if (!is.null(rev)) {
out <- ""
out1 <- paste0("out1=", dirname(fwd), "/", out.dir, "/", basename(fwd)) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
out2 <- paste0("out2=", dirname(rev), "/", out.dir, "/", basename(rev)) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
}
if (trim.end == "left") {
trim.end <- paste0("ktrim=l")
} else if (trim.end == "right") {
trim.end <- paste0("ktrim=r")
}
if (is.numeric(kmer)) {
kmer <- paste0("k=", kmer)
} else {
(kmer <- paste0("k=", sort(nchar(primers), decreasing = FALSE)[1]))
}
if (is.numeric(maxlength)) {
maxlength <- paste0("maxlength=", maxlength)
} else {
maxlength <- ""
}
if (is.numeric(mink)) {
mink <- paste0("mink=", mink) # Note - mink makes it noticibly slower
} else if (mink == TRUE) {
mink <- paste0("mink=", (sort(nchar(primers), decreasing = FALSE)[1] / 2))
} else if (mink == FALSE) {
mink <- ""
}
if (is.numeric(restrictleft)) {
restrictleft <- paste0("restrictleft=", restrictleft)
} else {
restrictleft <- paste0("restrictleft=", sort(nchar(primers), decreasing = TRUE)[1])
}
if (ordered == TRUE) {
ordered <- "ordered=T"
} else {
(ordered <- "")
}
if (is.numeric(hdist)) {
hdist <- paste0("hdist=", hdist)
}
if (degenerate == TRUE) {
degenerate <- "copyundefined"
} else {
(degenerate <- "")
}
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (tpe == TRUE) {
tpe <- "tpe"
} else {
(tpe <- "")
}
if (tbo == TRUE) {
tbo <- "tbo"
} else {
(tbo <- "")
}
# Check pairing
if(checkpairs == TRUE){
if (.Platform$OS.type == "unix") {
reformat_args <- paste(in1, in2, "vpair", collapse = " ")
# Run Reformatreads
result <- processx::run(command=paste0(install, "/reformat.sh"),
args = reformat_args,
echo=quiet,
echo_cmd = quiet,
spinner=TRUE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
} else{
reformat_args <- paste(" -cp ", paste0(install, "/current jgi.ReformatReads "), in1, in2, "vpair", collapse = " ")
# Run Reformatreads
result <- processx::run(command="java",
args = reformat_args,
echo=quiet,
echo_cmd = quiet,
spinner=TRUE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
}
if(stringr::str_detect(result$stderr, "Names do not appear to be correctly paired.")){
stop(message(paste0("ERROR: ", fwd, " and ", rev, " do not appear to be correctly paired")))
} else ( message("Reads are correctly paired"))
}
# Set up quality tracking
if (quality == TRUE) {
qualnames <- fwd %>% stringr::str_replace(pattern=".fastq.gz", replacement="") %>%
basename
qualnames <- paste0(tmp,"/", qualnames)
quality <-
paste0("bhist=", qualnames, "_bhist.txt ",
"qhist=", qualnames, "_qhist.txt ",
"gchist=", qualnames, "_gchist.txt ",
"aqhist=", qualnames, "_aqhist.txt ",
"lhist=", qualnames, "_lhist.txt ",
"gcbins=auto ")
} else {
(quality <- "")
}
if (.Platform$OS.type == "unix") {
args <- paste( in1, in2, literal, restrictleft, out, out1,
out2, kmer, mink, hdist, trim.end,tbo, tpe, degenerate, quality,
maxlength, force, "-da",
collapse = " "
)
result <- system2(command=paste0(install, "/bbduk.sh"),
args = args,
stdout = tmpout,
stderr = tmperr,
wait=TRUE)
}else{
# Run Reformatreads
args <- paste(" -cp ", paste0(install, "/current jgi.BBDuk "), in1, in2, literal, restrictleft, out, out1,
out2, kmer, mink, hdist, trim.end,tbo, tpe, degenerate, quality,
maxlength, force, "-da",
collapse = " "
)
result <- system2(command="java",
args = args,
stdout = tmpout,
stderr = tmperr,
wait=TRUE)
}
now <- date()
cat(paste0("Executed: ", now, "\n"), file = tmplogs, append=TRUE)
cat(paste0("Sample:\t", fwd, "\n"), file = tmplogs, append=TRUE)
file.append(tmplogs, tmperr)
file.remove(c(tmpout, tmperr))
}
if (nsamples > 1) {
for (i in 1:nsamples) {
bbduk(install = install, fwd = fwd[i], rev = rev[i], checkpairs=checkpairs,
primers = primers, restrictleft = restrictleft,
out.dir = out.dir, trim.end = trim.end, ordered = ordered,
kmer = kmer, mink = mink, tbo=tbo, tpe = tpe, hdist = hdist,
degenerate = degenerate, quality = quality,
force = force, maxlength = maxlength, tmp=tmp, quiet=quiet)
}
} else if (nsamples == 1) {
bbduk(install = install, fwd = fwd, rev = rev, checkpairs=checkpairs,
primers = primers, restrictleft = restrictleft,
out.dir = out.dir, trim.end = trim.end, ordered = ordered,
kmer = kmer, mink = mink, tbo=tbo, tpe = tpe, hdist = hdist,
degenerate = degenerate, quality = quality,
force = force, maxlength = maxlength, tmp=tmp, quiet=quiet)
}
#Parse logs
parsed <- parse_bbtrim(tmplogs)
if (quality == TRUE) {
#Base composition histogram by position.
bhist <- parse_bhist(tmp)
#Quality histogram by position.
qhist <- parse_qhist(tmp)
#Histogram of average read quality. - how does this work with binned qscores?
aqhist <- parse_aqhist(tmp)
#Read GC content histogram. - is it worth just reading in the top 4 lines?
gchist <- parse_aqhist(tmp)
#Read length histogram.
lhist <- parse_lhist(tmp)
out <- list(parsed,
bhist,
qhist,
aqhist,
gchist,
lhist)
names(out) <- c("reads", "bp_freq", "quality","avg_quality","gc","length")
} else (out <- parsed)
return(out)
}
# Split interleaved reads -------------------------------------------------
#' Split interleaved reads
#'
#' @param install (Required) Install location for bbmap
#' @param files (Required) Vector of locations of interleaved read files to split
#' @param force (Optional) Default TRUE Option to overwrite existing output files.
#'
#' @return
#' @export
#' @import stringr
#'
bbsplit <- function(install = NULL, files, force = FALSE) {
nsamples <- length(files)
# Create temp files
tmp <- tempdir()
tmplogs <- paste0(tmp, "/bbreformat.log")
tmpout <- paste0(tmp,"/stdout.log")
tmperr <- paste0(tmp,"/stderr.log")
bbtools_reformatreads <- function(install = NULL, file, force = FALSE) {
# Split interleaved reads
out1 <- paste0("out1=", file %>% str_replace(pattern = "_R1R2_", replacement = "_R1_"))
out2 <- paste0("out2=", file %>% str_replace(pattern = "_R1R2_", replacement = "_R2_"))
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (.Platform$OS.type == "unix") {
reformat_args <- paste(file, out1, out2, force, collapse = " ")
result <- system2(command=paste0(install, "/reformat.sh"),
args = reformat_args,
stdout = tmpout,
stderr = tmperr,
wait=TRUE)
} else{
reformat_args <- paste(" -cp ", paste0(install, "/current jgi.ReformatReads "),
file, out1, out2, force, collapse = " ")
# Run Reformatreads
result <- system2(command="java",
args = reformat_args,
stdout=tmpout,
stderr=tmperr,
wait=TRUE)
}
now <- date()
file.append(tmplogs, tmperr)
file.remove(c(tmpout, tmperr))
}
if (nsamples > 1) {
for (i in 1:nsamples) {
bbtools_reformatreads(install = install, file = files[i], force = force)
file.remove(files[i])
}
} else if (nsamples == 1) {
bbtools_reformatreads(install = install, file = files, force = force)
file.remove(files)
}
}
# Parse bbtrim logs ----------------------------------------------------------
#' Parse appended bbtrim logs into tidy format
#'
#' @param x (Required) an appended bbtrim log file generated by bbtrim
#'
#' @return Returns a tidy data frame
#'
#' @export
#' @import dplyr
#' @import tidyr
#' @import readr
#' @import stringr
#' @examples
parse_bbtrim <- function(x) {
lines <- readLines(x)
if (any(stringr::str_detect(lines, "Sample:"))){
sample <- readr::read_tsv(lines[which(stringr::str_sub(lines, 1, 7) == 'Sample:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="sample", sep="\\.", extra="drop") %>%
dplyr::select(sample)
} else (stop("Error: Sample not found in log file, try deleting temporary files"))
if (any(stringr::str_detect(lines, "Input:"))){
input <- readr::read_tsv(lines[which(stringr::str_sub(lines, 1, 6) == 'Input:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="input_reads", sep=" ", extra="drop") %>%
tidyr::separate(X4, into="input_bases", sep=" ", extra="drop") %>%
dplyr::select(input_reads, input_bases) %>%
dplyr::mutate_if(is.character, as.numeric)
} else (stop("Error: Input not found in log file, try deleting temporary files"))
if (any(stringr::str_detect(lines, "KTrimmed:"))){
ktrimmed <- readr::read_tsv(lines[which(stringr::str_sub(lines, 1, 9) == 'KTrimmed:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="ktrimmed_reads", sep=" ", extra="drop") %>%
tidyr::separate(X3, into="ktrimmed_bases", sep=" ", extra="drop") %>%
dplyr::select(ktrimmed_reads, ktrimmed_bases) %>%
dplyr::mutate_if(is.character, as.numeric)
} else (stop("Error: Ktrimmed not found in log file, try deleting temporary files"))
if (any(stringr::str_detect(lines, "Result:"))){
result <- readr::read_tsv(lines[which(stringr::str_sub(lines, 1, 7) == 'Result:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="output_reads", sep=" ", extra="drop") %>%
tidyr::separate(X3, into="output_bases", sep=" ", extra="drop") %>%
dplyr::select(output_reads, output_bases) %>%
dplyr::mutate_if(is.character, as.numeric)
} else (stop("Error: Result not found in log file, try deleting temporary files"))
out <- cbind(sample, input, ktrimmed, result)
return(out)
}
# Parse bbdemux -----------------------------------------------------------
#' Parse appended bbdemux logs into tidy format
#'
#' @param x (Required) an appended bbdemux log file generated by bbdemux
#'
#' @return Returns a tidy data frame
#' @import dplyr
#' @import tidyr
#' @import readr
#'
#' @examples
parse_bbdemux <- function(x) {
lines <- readLines(x)
sample <- readr::read_tsv(lines[which(str_sub(lines, 1, 7) == 'Sample:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="sample", sep="\\.", extra="drop") %>%
dplyr::select(sample)
input <- readr::read_tsv(lines[which(str_sub(lines, 1, 6) == 'Input:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="input_reads", sep=" ", extra="drop") %>%
tidyr::separate(X4, into="input_bases", sep=" ", extra="drop") %>%
dplyr::select(input_reads, input_bases) %>%
dplyr::mutate_if(is.character, as.numeric)
matched <- readr::read_tsv(lines[which(str_sub(lines, 1, 14) == 'Matched reads:' )], col_names = FALSE) %>%
tidyr::separate(X2, into="demulti_reads", sep=" ", extra="drop") %>%
tidyr::separate(X3, into="demulti_bases", sep=" ", extra="drop") %>%
dplyr::select(demulti_reads, demulti_bases) %>%
dplyr::mutate_if(is.character, as.numeric)
out <- cbind(sample, input, matched )
return(out)
}
# parse_bhist -------------------------------------------------------------
#' parse base frequency histograms from bbtrim
#'
#' @param dir (Required) a directory containing base frequency (bhist) files from bbtrim
#'
#' @return a tidy data frame
#' @import purrr
#' @import dplyr
#' @import readr
#' @import stringr
#'
#' @examples
parse_bhist <- function(dir) {
out <- list.files(path=dir, pattern="_bhist.txt", full.names = TRUE) %>%
purrr::set_names() %>%
purrr::map_dfr(readr::read_table2, .id = "Source", col_types = cols(
`#Pos` = col_double(),
A = col_double(),
C = col_double(),
G = col_double(),
T = col_double(),
N = col_double()
)) %>%
dplyr::rename(Pos = `#Pos`) %>%
dplyr::mutate(Source = Source %>%
stringr::str_remove(pattern = "^(.*)\\/") %>%
stringr::str_remove(pattern = "_bhist.txt"))
return(out)
}
# parse_qhist -------------------------------------------------------------
#' Parse quality score histograms from bbtrim
#'
#' @param dir (Required) a directory containing quality score (qhist) files from bbtrim
#'
#' @return a tidy data frame
#' @import purrr
#' @import dplyr
#' @import readr
#' @import stringr
#'
#' @examples
parse_qhist <- function(dir) {
out <- list.files(path=dir, pattern="_qhist.txt", full.names = TRUE) %>%
purrr::set_names() %>%
purrr::map_dfr(readr::read_table2, .id = "Source", col_types = cols(
`#BaseNum` = col_double(),
Read1_linear = col_double(),
Read1_log = col_double(),
Read2_linear = col_double(),
Read2_log = col_double()
)) %>%
dplyr::rename(Pos = `#BaseNum`) %>%
dplyr::mutate(Source = Source %>%
stringr::str_remove(pattern = "^(.*)\\/") %>%
stringr::str_remove(pattern = "_qhist.txt"))
return(out)
}
# parse_aqhist ------------------------------------------------------------
#' Parse average quality histograms from bbtrim
#'
#' @param dir (Required) a directory containing average quality (aqhist) files from bbtrim
#'
#' @return a tidy data frame
#' @import purrr
#' @import dplyr
#' @import readr
#' @import stringr
#'
#' @examples
parse_aqhist <- function(dir) {
out <- list.files(path=dir, pattern="_aqhist.txt", full.names = TRUE) %>%
purrr::set_names() %>%
purrr::map_dfr(readr::read_table2, .id = "Source", col_types = cols(
`#Quality` = col_double(),
count1 = col_double(),
fraction1 = col_double(),
count2 = col_double(),
fraction2 = col_double()
))%>%
dplyr::rename(avg_quality = `#Quality`) %>%
dplyr::mutate(Source = Source %>%
dplyr::str_remove(pattern = "^(.*)\\/")%>%
dplyr::str_remove(pattern = "_aqhist.txt"))
return(out)
}
# parse_gchist ------------------------------------------------------------
#' Parse GC content histograms from bbtrim
#'
#' @param dir (Required) a directory containing average quality (aqhist) files from bbtrim
#'
#' @return a tidy data frame
#' @import purrr
#' @import dplyr
#' @import stringr
#' @import readr
#' @import tidyr
#'
#' @examples
parse_gchist <- function(dir) {
out <- list.files(path=dir, pattern="_gchist.txt", full.names = TRUE) %>%
purrr::set_names() %>%
purrr::map_dfr(readr::read_table2, .id = "Source", n_max=4, col_names = FALSE, col_types = cols(
X1 = col_character(),
X2 = col_double()
)) %>%
dplyr::mutate(X1 = stringr::str_remove(`X1`, pattern="#"),
Source = Source %>%
stringr::str_remove(pattern = "^(.*)\\/")%>%
stringr::str_remove(pattern = "_gchist.txt")) %>%
dplyr::group_by(Source) %>%
tidyr::pivot_wider(names_from = X1, values_from = X2)
return(out)
}
# parse_lhist -------------------------------------------------------------
#' Parse sequence length histograms from bbtrim
#'
#' @param dir (Required) a directory containing length (lhist) files from bbtrim
#'
#' @return a tidy data frame
#' @import purrr
#' @import dplyr
#' @import readr
#' @import stringr
#'
#' @examples
parse_lhist <- function(dir) {
out <- list.files(path=dir, pattern="_lhist.txt", full.names = TRUE) %>%
purrr::set_names() %>%
purrr::map_dfr(readr::read_table2, .id = "Source", col_types= cols(
`#Length` = col_double(),
Count = col_double()
)) %>%
dplyr::rename(Length = `#Length`) %>%
dplyr::mutate(Source = Source %>%
stringr::str_remove(pattern = "^(.*)\\/")%>%
stringr::str_remove(pattern = "_lhist.txt"))
return(out)
}
# bbdemux2 ----------------------------------------------------------------
#' Demultiplex fusion primers using BBmap Seal
#'
#' @param install (Required) Install location for bbmap
#' @param fwd (Required) Vector of locations of forward reads
#' @param rev (Optional) Vector of locations of reverse reads
#' @param Fbarcodes (Required) Barcodes used in forward reads
#' @param Rbarcodes (Optional) Barcodes used in reverse reads
#' @param restrictleft (Optional) Defaults to the size of the largest primer.
#' Restricts the kmer search for primer sequences to just the left side of the molecule.
#' @param out.dir (Optional) Default "demux"
#' The path to write the output reads.
#' @param kmer (Optional) default the size of the smallest primer will be used.
#' The kmer size to use for primer searching.
#' @param hdist (Optional) Default = 0. The hamming distance (number of substitution errors) allowed for mismatch to the query primer.
#' @param degenerate (Optional) Default TRUE.
#' Option to search for all possible primer combinations for degenerate primers
#' @param force (Optional) Default TRUE
#' Option to overwrite existing output files.
#' @param interleaved (Optional) Default FALSE
#' Option to input interleaved reads
#' @param threads (Optional) Default autodetect
#' Number of CPU threads to use
#' WARNING: Thread detection can fail on cluster computing currently
#' @param mem (Optional) Default autodetect
#' GB of memory to use
#' WARNING: mem detection can fail on cluster computing currently
#'
#' @return
#' @export
#'
#' @import dplyr
#' @import tibble
#' @import stringr
#'
bbdemux2 <- function(install = NULL, fwd, rev = NULL, Fbarcodes = NULL, Rbarcodes = NULL, names=NULL,
restrictleft = NULL, out.dir = "demux", kmer = NULL, hdist = 0, degenerate = TRUE,
force = TRUE, mem = NULL, threads = NULL, quiet=FALSE) {
in1 <- paste0("in=", fwd)
if (!is.null(rev)) {
in2 <- paste0("in2=", rev)
} else {
(in2 <- "")
}
# Check if number of F and R primers match
if(!is.null(names)) {
if (!all.equal(length(names), length(Fbarcodes))) {
stop("names must be the same length as Fbarcodes or NULL")
}
}
if (!is.null(Fbarcodes) & is.null(Rbarcodes)) {
if(!is.null(names)){
writeLines(paste0(">",names, "\n", Fbarcodes, "\n"), con = "Fprimers.fa")
} else {
writeLines(paste0(">Rep", seq(1:length(Fbarcodes)), "\n", Fbarcodes, "\n"), con = "Fprimers.fa")
}
ref <- "ref=Fprimers.fa"
} else if (!is.null(Fbarcodes) & !is.null(Rbarcodes)) {
if(!is.null(names)){
writeLines(paste0(">",names, "\n", Fbarcodes, "\n"), con = "Fprimers.fa")
writeLines(paste0(">",names, "\n", Rbarcodes, "\n"), con = "Rprimers.fa")
}else {
writeLines(paste0(">Rep", seq(1:length(Fbarcodes)), "\n", Fbarcodes, "\n"), con = "Fprimers.fa", sep = "")
writeLines(paste0(">Rep", seq(1:length(Rbarcodes)), "\n", Rbarcodes, "\n"), con = "Rprimers.fa", sep = "")
}
ref <- "ref=Fprimers.fa,Rprimers.fa"
}
pattern <- paste0("pattern=", out.dir, "/", basename(fwd) %>%
stringr::str_split_fixed("\\.", n = 2) %>%
tibble::as_tibble() %>%
dplyr::pull(V1) %>%
stringr::str_replace(pattern = "_R1_", replacement = "_R1R2_"), "_%.fastq.gz")
if (is.numeric(kmer)) {
kmer <- paste0("k=", kmer)
} else {
kmer <- paste0("k=", sort(nchar(c(Fbarcodes, Rbarcodes)), decreasing = FALSE)[1])
}
if (is.numeric(restrictleft)) {
restrictleft <- paste0("restrictleft=", restrictleft)
} else {
restrictleft <- paste0("restrictleft=", sort(nchar(c(Fbarcodes, Rbarcodes)), decreasing = TRUE)[1])
}
if (is.numeric(hdist)) {
hdist <- paste0("hdist=", hdist)
}
if (degenerate == TRUE) {
degenerate <- "copyundefined"
} else {
(degenerate <- "")
}
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (!is.null(threads)) {
threads <- paste0("threads=", threads)
} else {
(threads <- "threads=auto")
}
if (!is.null(mem)) {
mem <- paste0("-Xmx", mem, "g")
} else {
(mem <- "")
}
if (.Platform$OS.type == "unix") {
# Parse args to seal
args <- paste(in1, in2, ref,
restrictleft, pattern, kmer, hdist,
degenerate, force, threads, "kpt=t",
collapse = " ")
# Run bbduk using shell script
result <- processx::run(command=paste0(install, "/seal.sh"),
args = args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
} else{
# Parse arguments to seal
args <- paste(" -cp ", paste0(install, "/current jgi.Seal"), mem, in1, in2, ref,
restrictleft, pattern, kmer, hdist,
degenerate, force, threads, "kpt=t",
collapse = " ")
# Run bbmap seal using java
if(!quiet) {message(paste0("Demultiplexing ",length(Fbarcodes)," tagged primers for: ", fwd, " and ", rev))}
result <- processx::run(command="java",
args = args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
}
# Parse stderr which contains read stats
lines <- read_lines(result$stderr)
sample <- read.table(text = lines[1], sep=",") %>%
mutate(sample = basename(V2) %>%
stringr::str_remove("_S.*$")) %>%
dplyr::select(sample)
input <- read.table(text = lines[which(str_sub(lines, 1, 6) == 'Input:' )]) %>%
dplyr::select(input_reads = V2, input_bases=V4)%>%
dplyr::mutate_if(is.character, as.numeric)
matched <- read.table(text = lines[which(str_sub(lines, 1, 14) == 'Matched reads:' )]) %>%
dplyr::select(demulti_reads = V3, demulti_bases=V6)%>%
dplyr::mutate_if(is.character, as.numeric)
#clean up temporary files
file.remove("Fprimers.fa")
file.remove("Rprimers.fa")
return(cbind(sample, input, matched ))
}
# bbtrim2 ------------------------------------------------------------------
#' Trim primers using BBDuk
#'
#' @param install (Required) Install location for bbmap
#' @param fwd (Required) Vector of locations of forward reads
#' @param rev (Optional) Vector of locations of reverse reads
#' @param primers (Required) Forward and reverse primers to trim
#' @param restrictleft (Optional) Defaults to the size of the largest primer.
#' Restricts the kmer search for primer sequences to just the left side of the molecule.
#' @param checkpairs (Optional) Whether paired end checking should be conducted
#' @param out.dir (Optional) Default "trimmed"
#' The path to write the output reads.
#' @param trim.end (Optional) Default is "left"
#' End of the molecule to trim primers from. "left" will trim primers from the 3' end of both forward and reverse reads.
#' Change to "right" only if the amplicon was too short and the sequencer has read into the other end of the molecule.
#' @param ordered (Optional) Default TRUE
#' Set to TRUE to output reads in same order as input.
#' @param kmer (Optional) default the size of the smallest primer will be used.
#' The kmer size to use for primer searching.
#' @param mink (optional) Default FALSE
#' Look for shorter kmers at read tips down to this length
#' @param hdist (Optional) Default 0. The hamming distance (number of substitution errors) allowed for mismatch to the query primer.
#' @param tpe (Otional) Default TRUE
#' Trim pairs evenly. When kmer right-trimming, trim both reads to the minimum length of either.
#' @param degenerate (Optional) Default TRUE.
#' Option to search for all possible primer combinations for degenerate primers
#' @param force (Optional) Default TRUE
#' Option to overwrite existing output files.
#' @param maxlength (Optional) Default FALSE
#' Remove all reads above a maximum length. Useful for removing reads where no primers were found.
#'
#' @return
#' @export
#'
#' @import dplyr
#' @import tidyr
#' @import readr
#' @import purrr
#' @import stringr
#' @import processx
#'
bbtrim2 <- function(install = NULL, fwd, rev = NULL, primers, checkpairs = FALSE,
restrictleft = NULL, out.dir = NULL, trim.end = "left", ordered = TRUE,
kmer = NULL, mink = FALSE, tbo= TRUE, tpe = TRUE, hdist = 0, degenerate = TRUE,
force = TRUE, maxlength = NULL, quiet=FALSE) {
if(!quiet & !is.null(rev)) {message(paste0("Trimming primers for: ", fwd, " and ", rev))
} else if(!quiet & is.null(rev)) {message(paste0("Trimming primers for: ", fwd))}
# Make out dir relative to current directory
if(is.null(out.dir)){
out.dir <- "bbduk"
}
if(!dir.exists(out.dir)){dir.create(out.dir)}
in1 <- paste0("in=", fwd)
if (!is.null(rev)) {
in2 <- paste0("in2=", rev)
} else {
(in2 <- "")
}
if (!is.null(primers)) {
#replace any inosines with N
if(any(stringr::str_detect(primers, "I"))){
primers <- stringr::str_replace(primers,"I", "N")
if(!quiet){message("Replaced I bases in primers with N for trimming")}
}
literal <- paste0("literal=", paste0(primers, collapse = ","))
} else {
(stop("Primer sequences are required for trimming"))
}
if (is.null(rev)) {
out <- paste0( "out=", out.dir, "/", basename(fwd)) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
out1 <- ""
out2 <- ""
} else if (!is.null(rev)) {
out <- ""
out1 <- paste0("out1=", out.dir, "/", basename(fwd)) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
out2 <- paste0("out2=", out.dir, "/", basename(rev)) %>%
stringr::str_replace(pattern = ".fastq", replacement = ".trimmed.fastq")
}
if (trim.end == "left") {
trim.end <- paste0("ktrim=l")
} else if (trim.end == "right") {
trim.end <- paste0("ktrim=r")
}
if (is.numeric(kmer)) {
kmer <- paste0("k=", kmer)
} else {
(kmer <- paste0("k=", sort(nchar(primers), decreasing = FALSE)[1]))
}
if (is.numeric(maxlength)) {
maxlength <- paste0("maxlength=", maxlength)
} else {
maxlength <- ""
}
if (is.numeric(mink)) {
mink <- paste0("mink=", mink) # Note - mink makes it noticibly slower
} else if (mink == TRUE) {
mink <- paste0("mink=", (sort(nchar(primers), decreasing = FALSE)[1] / 2))
} else if (mink == FALSE) {
mink <- ""
}
if (is.numeric(restrictleft)) {
restrictleft <- paste0("restrictleft=", restrictleft)
} else {
restrictleft <- paste0("restrictleft=", sort(nchar(primers), decreasing = TRUE)[1])
}
if (ordered == TRUE) {
ordered <- "ordered=T"
} else {
(ordered <- "")
}
if (is.numeric(hdist)) {
hdist <- paste0("hdist=", hdist)
}
if (degenerate == TRUE) {
degenerate <- "copyundefined"
} else {
(degenerate <- "")
}
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (tpe == TRUE) {
tpe <- "tpe"
} else {
(tpe <- "")
}
if (tbo == TRUE) {
tbo <- "tbo"
} else {
(tbo <- "")
}
# Check pairing
if(checkpairs == TRUE){
if (.Platform$OS.type == "unix") {
reformat_args <- paste(in1, in2, "vpair", collapse = " ")
# Run Reformatreads
result <- processx::run(command=paste0(install, "/reformat.sh"),
args = reformat_args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
} else{
reformat_args <- paste(" -cp ", paste0(install, "/current jgi.ReformatReads "), in1, in2, "vpair", collapse = " ")
# Run Reformatreads
result <- processx::run(command="java",
args = reformat_args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
}
if(stringr::str_detect(result$stderr, "Names do not appear to be correctly paired.")){
stop(message(paste0("ERROR: ", fwd, " and ", rev, " do not appear to be correctly paired")))
} else ( message("Reads are correctly paired"))
}
if (.Platform$OS.type == "unix") {
# Parse args to bbduk
args <- paste( in1, in2, literal, restrictleft, out, out1,
out2, kmer, mink, hdist, trim.end,tbo, tpe, degenerate,
maxlength, force, "-da",
collapse = " "
)
# Run bbduk using shell script
result <- processx::run(command=paste0(install, "/bbduk.sh"),
args = args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
} else{
# Parse args to bbduk
args <- paste(" -cp ", paste0(install, "/current jgi.BBDuk "), in1, in2, literal, restrictleft, out, out1,
out2, kmer, mink, hdist, trim.end,tbo, tpe, degenerate,
maxlength, force, "-da",
collapse = " "
)
# Run bbduk using java
result <- processx::run(command="java",
args = args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
}
# parse stderr containing results
lines <- read_lines(result$stderr)
sample <- read.table(text = lines[1], sep=",") %>%
mutate(sample = V1 %>%
stringr::str_remove("^.*in=") %>%
basename() %>%
stringr::str_remove("_S.*$")) %>%
dplyr::select(sample)
input <- read.table(text = lines[which(str_sub(lines, 1, 6) == 'Input:' )]) %>%
dplyr::select(input_reads = V2, input_bases=V4)%>%
dplyr::mutate_if(is.character, as.numeric)
ktrimmed <- read.table(text = lines[which(stringr::str_sub(lines, 1, 9) == 'KTrimmed:' )]) %>%
dplyr::select(ktrimmed_reads = V2, ktrimmed_bases=V5)%>%
dplyr::mutate_if(is.character, as.numeric)
output <- read.table(text = lines[which(stringr::str_sub(lines, 1, 7) == 'Result:' )]) %>%
dplyr::select(output_reads = V2, output_bases=V5)%>%
dplyr::mutate_if(is.character, as.numeric)
return(cbind(sample, input, ktrimmed, output))
}
# bbsplit2 -----------------------------------------------------------------
#' Split interleaved reads
#'
#' @param install (Required) Install location for bbmap
#' @param files (Required) Vector of locations of interleaved read files to split
#' @param force (Optional) Default TRUE Option to overwrite existing output files.
#'
#' @return
#' @export
#' @importFrom stringr str_replace
#' @importFrom processx run
#'
bbsplit2 <- function(install = NULL, file, force = FALSE, quiet=FALSE) {
# Split interleaved reads
out1 <- paste0("out1=", file %>% stringr::str_replace(pattern = "_R1R2_", replacement = "_R1_"))
out2 <- paste0("out2=", file %>% stringr::str_replace(pattern = "_R1R2_", replacement = "_R2_"))
if(!quiet) {message(paste0("De-interleaving reads for: ", file))}
if (force == TRUE) {
force <- "overwrite=TRUE"
} else {
(force <- "")
}
if (.Platform$OS.type == "unix") {
reformat_args <- paste(file, out1, out2, force, collapse = " ")
# Run Reformatreads using shell script
processx::run(command=paste0(install, "/reformat.sh"),
args = reformat_args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
file.remove(file)
} else{
reformat_args <- paste(" -cp ", paste0(install, "/current jgi.ReformatReads "),
file, out1, out2, force, collapse = " ")
# Run Reformatreads using java
processx::run(command="java",
args = reformat_args,
echo=!quiet,
echo_cmd = !quiet,
spinner=TRUE,
stderr_to_stdout = FALSE,
windows_verbatim_args=TRUE,
error_on_status = TRUE,
cleanup_tree = TRUE)
file.remove(file)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.