R/merge_lanes.R

Defines functions merge_lanes

Documented in merge_lanes

#' Merge fastq lane files 
#'
#' \code{merge_lanes} Identifies, reads and merges fastq lane files.
#'
#' Given an input path were flow cell lane files in fastq format (typically
#' generated by for example Illumina sequencers) this function will try to
#' identify unique samples among the lane files and merge all lane files for
#' each sample. The function uses md5 checks to control that each lane file is
#' unique (no accidental copies) and will compare expected outcomes given that
#' all samples should have the same number of lanes (comming from the same
#' experiment). If this function does not work, try defining number of lanes
#' to be merged by nlanes. 
#' 
#' @family PAC generation
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param in_path Character string with the path to lane files that should be
#'   merged. The names of the lane files should follow the convention:
#'   'sample1_lane1.fastq.gz, sample1_lane2.fastq.gz, sample2_lane1.fastq.gz,
#'   etc', where the first part of the name indicate the sample while the second
#'   part indicate lanes. The function will automatically try to identify sample
#'   names based on that the second part is "_lane" or "_L00". The function only takes fastq.gz
#'   compressed files.
#'   
#' @param out_path Character string with the path to destination folder for
#'   merged fastq.gz files.
#'   
#' @param threads Integer indicating the number of parallel processes that
#'   should be used.
#'
#' @param nlanes Integer indicating the number of lanes that should be merged.
#'   This works as a safety measure to ensure correct lane merging if names are
#'   long or complicated. Default=NULL.
#'
#' @return Merged fastq files in destination folder.
#'
#' @examples
#'
#' ## The simple principle: 
#' # in_path <- "/some/path/to/lane/files/fastq.gz"
#' # out_path <- "/some/path/to/merged/files/"
#' # merge_lanes(in_path, out_path, threads=12)
#' 
#' 
#' ## Real example
#' # First generate some correct file names (see above).
#' sys_path = system.file("extdata", package = "seqpac", mustWork = TRUE)
#' fq <- list.files(path = sys_path, pattern = "fastq", all.files = FALSE,
#'                 full.names = TRUE)
#'  
#' # Create an output folder
#' input <- paste0(tempdir(), "/lanes/")
#' output <- paste0(tempdir(), "/merged/")
#' dir.create(input, showWarnings=FALSE)
#' dir.create(output, showWarnings=FALSE)

#' # Fix compatible file names
#' file.copy(from = fq, to = input)
#' old_fls <- list.files(input, full.names=TRUE)
#' new_sample <- c(rep("sample1_", times=3), rep("sample2_", times=3))
#' new_lane <- rep(c("lane1","lane2","lane3"), times=2)
#' new_fls <- paste0(input,new_sample, new_lane, ".fastq.gz")
#' file.rename(from = old_fls, to = new_fls)
#' 
#' # Then merge the fastq files
#' merge_lanes(input, output, threads=2)
#' 
#' # You will find the files in:
#' input
#' output
#'
#' 
#' ##-----------------------------------------##
#' ## Warning: Clean up temp folder           ##
#' # (Sometimes needed for automated examples) 
#' 
#' closeAllConnections()
#' fls_temp  <- tempdir()
#' fls_temp  <- list.files(fls_temp, recursive=TRUE, full.names = TRUE)
#' suppressWarnings(file.remove(fls_temp))
#' 
#' @export

merge_lanes <- function(in_path, out_path, threads=1, nlanes=NULL){
  j <- NULL
  fls <- list.files(in_path, pattern=".fastq.gz", recursive  = TRUE)
  fls_full <- list.files(in_path, pattern=".fastq.gz", full.names = TRUE, recursive  = TRUE)

  # Error if no files are found
  if(length(fls) < 1){
    stop("\nNo fastq files were found. Double check the input folder", 
         "\nand make sure that fastq is in the correct format (.fastq.gz).") 
  }
  
  # Check md5
  test <- list(NULL)
  md5 <- lapply(as.list(fls_full), function(x){
    fstq <- data.table::fread(x, header=FALSE, nrows=1000)
   test <- digest::digest(fstq, algo="md5")
    return(test)
  })
  if(any(duplicated(unlist(md5)))){
    stop("\nTwo or more fastq files were identical (md5 check). Possible ", 
         "\nreason: you have mistakenly made two copies of the same file.")
  }
  
  # Fix name vy trimming in the end until shorter unique
  fls_nam <- fls
  length(fls_nam) <- length(fls)
  if(!is.null(nlanes)){
    fls_nam<-fls_nam[seq(1, length(fls_nam), nlanes)]
  }
  test <- unique(gsub("_L00.*|_lane.*|_Lane.*", "", fls_nam))
  fls_nam<-unique(test)
  if(!is.null(nlanes)){
    if(!length(fls_nam)==(length(fls)/nlanes)){
      stop("\nThe amount of uniquely found names are not corresponding to
             defined number of lanes (nlanes).")
    }
  }

  # trim further and test if still valid
  test <- unique(gsub("_L00.*", "", fls_nam))
  if(length(test) == length(fls_nam)){
    fls_nam <- test
  }
  
  # Identify and merge files of unique samples save in output
  doParallel::registerDoParallel(threads)
  `%dopar%` <- foreach::`%dopar%`
  done <- foreach::foreach(j=seq.int(length(fls_nam))) %dopar% {
    fl_base<- fls_nam[j]
    lns  <- which(grepl(fl_base, fls_full))
    fsp_nam  <- paste0(fl_base, ".fastq.gz")
    fsp_nam <- sub(".*/", "", fsp_nam)
    out_nam  <- file.path(out_path, fsp_nam)
    for(i in seq.int(length(lns))){
        if(i == 1){
          file.copy(fls_full[lns[i]], out_nam)
        }else{
          file.append(out_nam, fls_full[lns[i]])
        }
    }
  }
  out_path <-return("TRUE")
  doParallel::stopImplicitCluster()
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.