R/dimsum_stage_vsearch.R

Defines functions dimsum_stage_vsearch

Documented in dimsum_stage_vsearch

#' dimsum_stage_vsearch
#'
#' Run VSEARCH on all fastq files.
#'
#' @param dimsum_meta an experiment metadata object (required)
#' @param vsearch_outpath VSEARCH output path (required)
#' @param report whether or not to generate VSEARCH summary plots (default: TRUE)
#' @param report_outpath VSEARCH report output path
#' @param save_workspace whether or not to save the current workspace (default: TRUE)
#'
#' @return an updated experiment metadata object
#' @export
dimsum_stage_vsearch <- function(
  dimsum_meta,
  vsearch_outpath,
  report = TRUE,
  report_outpath = NULL,
  save_workspace = TRUE
  ){

  #Whether or not to execute the system command
  this_stage <- 3
  execute <- (dimsum_meta[["startStage"]] <= this_stage & dimsum_meta[["stopStage"]] >= this_stage)

  #WRAP not run or this stage after stopStage
  if(!is.null(dimsum_meta[["countPath"]]) | dimsum_meta[["stopStage"]] < this_stage){
    return(dimsum_meta)
  }

  #Save current workspace for debugging purposes
  if(save_workspace){dimsum__save_metadata(dimsum_meta = dimsum_meta, n = 2)}

  #Create/overwrite vsearch directory (if executed)
  vsearch_outpath <- gsub("/$", "", vsearch_outpath)
  dimsum__create_dir(vsearch_outpath, execute = execute, message = "DiMSum STAGE 3 (WRAP): ALIGN PAIRED-END READS") 

  #Input files
  all_fastq <- file.path(dimsum_meta[["exp_design"]][1,"pair_directory"], unique(c(dimsum_meta[['exp_design']][,"pair1"], dimsum_meta[['exp_design']][,"pair2"])))
  #Check if all input files exist
  dimsum__check_files_exist(
    required_files = all_fastq,
    stage_number = this_stage,
    execute = execute)

  #Sample names
  sample_names <- paste0(
    dimsum_meta[["exp_design"]][,"sample_name"], '_e', 
    dimsum_meta[["exp_design"]][,"experiment"], '_s', 
    dimsum_meta[["exp_design"]][,"selection_id"], '_b', 
    dimsum_meta[["exp_design"]][,"biological_replicate"], '_t', 
    dimsum_meta[["exp_design"]][,"technical_replicate"], sep = "")
    # dimsum_meta[["exp_design"]][,"technical_replicate"], '_split', 
    # dimsum_meta[["exp_design"]][,"split"], sep = "")

  #Additional vsearch options related to alignment length
  temp_options <- paste0(' -fastq_minovlen ', dimsum_meta[["vsearchMinovlen"]])
  if( dimsum_meta[["vsearchMinovlen"]] < 10 ){temp_options <- paste0(temp_options, " -fastq_maxdiffs ", dimsum_meta[["vsearchMinovlen"]])}
  # if( dimsum_meta[["vsearchMinovlen"]] < 16 ){temp_options <- paste0(temp_options, " -minhsp ", dimsum_meta[["vsearchMinovlen"]])}
  # if( dimsum_meta[["vsearchMinovlen"]] < 16 ){temp_options <- paste0(temp_options, " -band ", dimsum_meta[["vsearchMinovlen"]])}
  # if( dimsum_meta[["vsearchMinovlen"]] < 5 ){temp_options <- paste0(temp_options, " -hspw ", dimsum_meta[["vsearchMinovlen"]])}

  #Run VSEARCH on all fastq file pairs
  dimsum__status_message("Aligning paired-end FASTQ files with VSEARCH:\n")
  dimsum__status_message(paste0(all_fastq, "\n"))
  dimsum__status_message("Processing...\n")
  #Trans library mode?
  if(dimsum_meta[["transLibrary"]]){
    for(i in 1:length(sample_names)){dimsum__status_message(paste0("\t", paste0(unlist(dimsum_meta[["exp_design"]][i,c('pair1', 'pair2')]), collapse = "\t"), "\n"))}
    if(execute){
      # Setup cluster
      clust <- parallel::makeCluster(dimsum_meta[['numCores']])
      # make variables available to each core's workspace
      parallel::clusterExport(clust, list("dimsum_meta","vsearch_outpath","sample_names","dimsum__concatenate_reads"), envir = environment())
      parallel::parSapply(clust,X = 1:length(sample_names), dimsum__vsearch_trans_library_helper)
      parallel::stopCluster(clust)
    }
  #Single-end mode?
  }else if(!dimsum_meta[["paired"]]){
    for(i in 1:length(sample_names)){dimsum__status_message(paste0("\t", paste0(unique(unlist(dimsum_meta[["exp_design"]][i,c('pair1', 'pair2')])), collapse = "\t"), "\n"))}
    if(execute){
      # Setup cluster
      clust <- parallel::makeCluster(dimsum_meta[['numCores']])
      # make variables available to each core's workspace
      parallel::clusterExport(clust, list("dimsum_meta","vsearch_outpath","sample_names","dimsum__filter_single_end_reads"), envir = environment())
      parallel::parSapply(clust,X = 1:length(sample_names), dimsum__vsearch_single_end_library_helper)
      parallel::stopCluster(clust)
    }
  #Classic paired-end mode
  }else{
    for(i in 1:length(sample_names)){
      dimsum__status_message(paste0("\t", paste0(unlist(dimsum_meta[["exp_design"]][i,c('pair1', 'pair2')]), collapse = "\t"), "\n"))
      #Check if this system command should be executed
      if(execute){
        temp_out <- system(paste0(
          "vsearch -fastq_mergepairs ",
          file.path(dimsum_meta[["exp_design"]][i,"pair_directory"], dimsum_meta[["exp_design"]][i,"pair1"]),
          " -reverse ",
          file.path(dimsum_meta[["exp_design"]][i,"pair_directory"], dimsum_meta[["exp_design"]][i,"pair2"]),
          " -fastqout - ",
          " -quiet ",
          # " -fastq_minqual ",
          # as.character(dimsum_meta[["vsearchMinQual"]]),
          " -fastq_maxee ",
          as.character(dimsum_meta[["vsearchMaxee"]]),
          " -fastq_minlen ",
          as.character(dimsum_meta[["cutadaptMinLength"]]),
          temp_options,
          " -threads ",
          dimsum_meta[['numCores']],
          " --fastq_allowmergestagger ",
          " --fastq_qmax ",
          as.character(dimsum_meta[["vsearchMaxQual"]]),
          " --fastq_qmaxout ",
          as.character(dimsum_meta[["vsearchMaxQual"]]),
          " 2> ",
          file.path(vsearch_outpath, paste0(sample_names[i], '.report.prefilter')),
          " | gzip > ",
          file.path(vsearch_outpath, paste0(sample_names[i], '.vsearch.prefilter.gz'))))
      }
    }
    dimsum__status_message("Filtering aligned reads...\n")
    if(execute){
      #Input files
      input_files <- c(
        file.path(vsearch_outpath, paste0(sample_names, '.vsearch.prefilter.gz')),
        file.path(vsearch_outpath, paste0(sample_names, '.report.prefilter')))
      #Check if all input files exist
      dimsum__check_files_exist(
        required_files = input_files,
        stage_number = this_stage,
        execute = execute)
      # Setup cluster
      clust <- parallel::makeCluster(dimsum_meta[['numCores']])
      # make variables available to each core's workspace
      parallel::clusterExport(clust, list("dimsum_meta","vsearch_outpath","sample_names","dimsum__filter_reads"), envir = environment())
      parallel::parSapply(clust,X = 1:length(sample_names), dimsum__filter_reads_helper)
      parallel::stopCluster(clust)
    }
  }
  #New experiment metadata
  dimsum_meta_new <- dimsum_meta
  #Merged fastq filenames
  dimsum_meta_new[["exp_design"]][,"aligned_pair"] <- paste0(sample_names, ".vsearch.gz")
  dimsum_meta_new[['exp_design']][,"aligned_pair_directory"] <- vsearch_outpath
  #Delete files when last stage complete
  if(!dimsum_meta_new[["retainIntermediateFiles"]]){
    if(dimsum_meta_new[["stopStage"]]==this_stage){
      if(!is.null(dimsum_meta_new[["deleteIntermediateFiles"]])){suppressWarnings(temp_out <- file.remove(dimsum_meta_new[["deleteIntermediateFiles"]]))}
      if(!is.null(dimsum_meta_new[["touchIntermediateFiles"]])){suppressWarnings(temp_out <- file.create(dimsum_meta_new[["touchIntermediateFiles"]]))}
    }else{
      dimsum_meta_new[["deleteIntermediateFiles"]] <- c(
        dimsum_meta_new[["deleteIntermediateFiles"]], 
        file.path(vsearch_outpath, dir(vsearch_outpath, "*.vsearch.gz$")),
        file.path(vsearch_outpath, dir(vsearch_outpath, "*.vsearch.prefilter.gz$")))
    }
  }
  #Generate vsearch report
  if(report){
    tryCatch({
      dimsum_meta_new_report <- dimsum__vsearch_report(dimsum_meta = dimsum_meta_new, report_outpath = report_outpath)
      }, error=function(e){
        dimsum__status_message("There were problems while running 'dimsum__vsearch_report'\n")
        dimsum_meta_new_report <- dimsum_meta_new
        })
    return(dimsum_meta_new_report)
  }
  return(dimsum_meta_new)
}
lehner-lab/DiMSum documentation built on April 10, 2024, 4:15 a.m.