#' Merge fastq lane files
#'
#' \code{merge_lanes} Identifies, reads and merges fastq lane files.
#'
#' Given an input path were flow cell lane files in fastq format (typically
#' generated by for example Illumina sequencers) this function will try to
#' identify unique samples among the lane files and merge all lane files for
#' each sample. The function uses md5 checks to control that each lane file is
#' unique (no accidental copies) and will compare expected outcomes given that
#' all samples should have the same number of lanes (comming from the same
#' experiment). If this function does not work, try defining number of lanes
#' to be merged by nlanes.
#'
#' @family PAC generation
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param in_path Character string with the path to lane files that should be
#' merged. The names of the lane files should follow the convention:
#' 'sample1_lane1.fastq.gz, sample1_lane2.fastq.gz, sample2_lane1.fastq.gz,
#' etc', where the first part of the name indicate the sample while the second
#' part indicate lanes. The function will automatically try to identify sample
#' names based on that the second part is "_lane" or "_L00". The function only takes fastq.gz
#' compressed files.
#'
#' @param out_path Character string with the path to destination folder for
#' merged fastq.gz files.
#'
#' @param threads Integer indicating the number of parallel processes that
#' should be used.
#'
#' @param nlanes Integer indicating the number of lanes that should be merged.
#' This works as a safety measure to ensure correct lane merging if names are
#' long or complicated. Default=NULL.
#'
#' @return Merged fastq files in destination folder.
#'
#' @examples
#'
#' ## The simple principle:
#' # in_path <- "/some/path/to/lane/files/fastq.gz"
#' # out_path <- "/some/path/to/merged/files/"
#' # merge_lanes(in_path, out_path, threads=12)
#'
#'
#' ## Real example
#' # First generate some correct file names (see above).
#' sys_path = system.file("extdata", package = "seqpac", mustWork = TRUE)
#' fq <- list.files(path = sys_path, pattern = "fastq", all.files = FALSE,
#' full.names = TRUE)
#'
#' # Create an output folder
#' input <- paste0(tempdir(), "/lanes/")
#' output <- paste0(tempdir(), "/merged/")
#' dir.create(input, showWarnings=FALSE)
#' dir.create(output, showWarnings=FALSE)
#' # Fix compatible file names
#' file.copy(from = fq, to = input)
#' old_fls <- list.files(input, full.names=TRUE)
#' new_sample <- c(rep("sample1_", times=3), rep("sample2_", times=3))
#' new_lane <- rep(c("lane1","lane2","lane3"), times=2)
#' new_fls <- paste0(input,new_sample, new_lane, ".fastq.gz")
#' file.rename(from = old_fls, to = new_fls)
#'
#' # Then merge the fastq files
#' merge_lanes(input, output, threads=2)
#'
#' # You will find the files in:
#' input
#' output
#'
#'
#' ##-----------------------------------------##
#' ## Warning: Clean up temp folder ##
#' # (Sometimes needed for automated examples)
#'
#' closeAllConnections()
#' fls_temp <- tempdir()
#' fls_temp <- list.files(fls_temp, recursive=TRUE, full.names = TRUE)
#' suppressWarnings(file.remove(fls_temp))
#'
#' @export
merge_lanes <- function(in_path, out_path, threads=1, nlanes=NULL){
j <- NULL
fls <- list.files(in_path, pattern=".fastq.gz", recursive = TRUE)
fls_full <- list.files(in_path, pattern=".fastq.gz", full.names = TRUE, recursive = TRUE)
# Error if no files are found
if(length(fls) < 1){
stop("\nNo fastq files were found. Double check the input folder",
"\nand make sure that fastq is in the correct format (.fastq.gz).")
}
# Check md5
test <- list(NULL)
md5 <- lapply(as.list(fls_full), function(x){
fstq <- data.table::fread(x, header=FALSE, nrows=1000)
test <- digest::digest(fstq, algo="md5")
return(test)
})
if(any(duplicated(unlist(md5)))){
stop("\nTwo or more fastq files were identical (md5 check). Possible ",
"\nreason: you have mistakenly made two copies of the same file.")
}
# Fix name vy trimming in the end until shorter unique
fls_nam <- fls
length(fls_nam) <- length(fls)
if(!is.null(nlanes)){
fls_nam<-fls_nam[seq(1, length(fls_nam), nlanes)]
}
test <- unique(gsub("_L00.*|_lane.*|_Lane.*", "", fls_nam))
fls_nam<-unique(test)
if(!is.null(nlanes)){
if(!length(fls_nam)==(length(fls)/nlanes)){
stop("\nThe amount of uniquely found names are not corresponding to
defined number of lanes (nlanes).")
}
}
# trim further and test if still valid
test <- unique(gsub("_L00.*", "", fls_nam))
if(length(test) == length(fls_nam)){
fls_nam <- test
}
# Identify and merge files of unique samples save in output
doParallel::registerDoParallel(threads)
`%dopar%` <- foreach::`%dopar%`
done <- foreach::foreach(j=seq.int(length(fls_nam))) %dopar% {
fl_base<- fls_nam[j]
lns <- which(grepl(fl_base, fls_full))
fsp_nam <- paste0(fl_base, ".fastq.gz")
fsp_nam <- sub(".*/", "", fsp_nam)
out_nam <- file.path(out_path, fsp_nam)
for(i in seq.int(length(lns))){
if(i == 1){
file.copy(fls_full[lns[i]], out_nam)
}else{
file.append(out_nam, fls_full[lns[i]])
}
}
}
out_path <-return("TRUE")
doParallel::stopImplicitCluster()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.