R/parsing_raw_data.R

Defines functions merging_by_sample extract_checkm extract_bbtools extract_quast read_tsv_report find_tool_reports get_run_date

Documented in extract_bbtools extract_checkm extract_quast find_tool_reports get_run_date merging_by_sample read_tsv_report

### Functions to get run info -------------------------------------------------

#' Get run date from iRODS collection name of Juno output.
#'
#' @param input_dir Collection name of Juno output as assigned by iRODS in
#'   previous pipeline. It MUST start with the date in the format "yymmdd".
#'
#' @return String with date-like format (yy-mm-dd).
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples get_run_date("juno_results/200922_NB502001_0153_AHGNLNAFX2_0007")
get_run_date <- function(input_dir){
  stopifnot( stringr::str_detect(input_dir, "\\d{6}") )

  if ( stringr::str_detect(input_dir, "/") ){
    input_dir <- basename(input_dir)
  }

  date_dir <- input_dir %>%
    stringr::str_extract("\\d{6}") %>%
    purrr::map_chr(stringr::str_replace,
                   "(\\d{2})(\\d{2})(\\d{2})$","\\1-\\2-\\3") %>%
    as.Date( , format = "%y-%m-%d")

  if( difftime(as.Date(date_dir, format = "%y-%m-%d"), Sys.Date(), units = "days") %>%
      as.numeric() %>% abs() > 3650) {
    warning("The date is too far in the past or in the future.
            This might be an error in your folder name.
            Make sure that the date is included in the folder name in the format 'yymmdd'.")
  }

  date_dir

}

### Functions to get and parse tool reports -----------------------------------

#' Find tool reports in Juno results.
#'
#' @param input_dir Path to directory with results from Juno pipeline.
#' @param tool c("quast", "bbtools", "checkm").
#'
#' @return The file path(s) of the desired report (tool) type in the input
#'   directory.
#'
#' @importFrom dplyr %>%
#' @importFrom stringr str_detect
#'
find_tool_reports <- function(input_dir=character(),
                              tool=c("quast", "bbtools", "checkm")){

  stopifnot( dir.exists(input_dir) )

  pattern_name <- "_report.tsv"

  if (tool == "quast"){
    pattern_path <- "quast|QUAST"
  } else if (tool == "checkm"){
    pattern_path <- "checkm|CheckM|CHECKM"
  } else if (tool == "bbtools"){
    pattern_path <- "bbtools"
  } else {
    stop("argument tool must be either 'quast', 'bbtools' or 'checkm'")
  }

  list.files( input_dir,  pattern = pattern_name, full.names = T, recursive = T) %>%
    `[`(stringr::str_detect(., pattern_path )) %>%
    `[`(! stringr::str_detect(., "per_sample"))
}



#' Read tsv report
#'
#' @param file_path Path to tsv report.
#'
#' @return Tibble containing the tsv report
#'
#' @import dplyr
#' @importFrom purrr map
#'
#'
read_tsv_report <- function(file_path){
  stopifnot(file.exists(file_path))

  if( length(file_path) > 1 ){
    warning("The input directory contains more than one report.tsv.
            The different report.tsv files will be merged in one
            single dataframe.")
    report <- suppressWarnings(suppressMessages(
      purrr::map(file_path, readr::read_tsv, col_names = TRUE) )) %>%
      dplyr::bind_rows()
  } else{
    report <- suppressWarnings( suppressMessages(readr::read_tsv(file_path, col_names = TRUE) ))
  }

  return(report)

}


#' Read information from QUAST report
#'
#' @param input_dir Path to directory with results from Juno pipeline.
#'
#' @return Tidy data frame containing information from QUAST report to be used
#'   for this QC report
#' @export
#'
#' @import dplyr
#'
extract_quast <- function(input_dir = character()){
  file_path <- find_tool_reports(input_dir, "quast")
  read_tsv_report(file_path) %>%
    dplyr::select("Assembly", "# contigs", "Total length", "GC (%)", "N50", "L50") %>%
    dplyr::rename("Sample" = "Assembly")
}

#' Read information from bbtools report
#'
#' @param input_dir Path to directory with results from Juno pipeline.
#'
#' @return Tidy data frame containing information from bbtools report to be used
#'   for this QC report.
#' @export
#'
#' @import dplyr
#'
extract_bbtools <- function(input_dir = character()){
  file_path <- find_tool_reports(input_dir, "bbtools")
  read_tsv_report(file_path) %>%
    dplyr::select("Sample",	"Percent mapped",	"Average coverage",
                  #"Reads", "Mapped reads", "Mapped bases", "Ref scaffolds", "Ref bases",	"Percent proper pairs",
                  "Percent of reference bases covered")
}

#' Read information from CheckM report
#'
#' @param input_dir Path to directory with results from Juno pipeline.
#'
#' @return Tidy data frame containing information from CheckM report to be used
#'   for this QC report
#' @export
#'
#' @import dplyr
#'
extract_checkm <- function(input_dir = character()){
  file_path <- find_tool_reports(input_dir, "checkm")
  read_tsv_report(file_path) %>%
    dplyr::rename( "Sample" = "sample" ) %>%
    dplyr::select("Sample", "completeness", "contamination",	"strain_heterogeneity") %>%
    dplyr::mutate("Sample" = stringr::str_remove(Sample, "L$"))
}



### Functions to merge datasets -----------------------------------------------

#' Merge datasets by sample
#'
#' @param vector_w_dataframes Character vector with the name of the datasets to
#'   merge. Each dataset MUST contain at least one column called "Sample" that
#'   is common between them.
#'
#' @param run_date Character vector with the date in the format "yy-mm-dd" as
#'   given by refsamp::get_run_date()
#'
#' @return Tibble with the different datasets provided in vector_w_dataframes
#'   bound by the 'Sample' column.
#' @export
#'
#' @importFrom purrr map
#' @import dplyr
#'
merging_by_sample <- function(vector_w_dataframes, run_date){
  stopifnot(is.character(vector_w_dataframes))
  stopifnot(class(run_date) == "Date")
  stopifnot(length(vector_w_dataframes) >= 2)

  # Read datasets and make sure they have a "Sample" column
  datasets_to_merge <- vector_w_dataframes %>%
    purrr::map(as.name) %>%
    purrr::map(eval)

  # Merge datasets by Sample
  merged_dataset <- datasets_to_merge[[1]]
  for(i in 2:length(datasets_to_merge)){
    merged_dataset <- dplyr::full_join(merged_dataset, datasets_to_merge[[i]], by = "Sample")
  }

  # Add genus name and run_date
  merged_dataset <- merged_dataset %>%
    left_join(genera_criteria[c("Sample", "Genus")], by = "Sample")  %>%
    mutate("Run_date" = run_date)

  return(merged_dataset)

}
AleSR13/refsampr documentation built on May 30, 2022, 5:42 a.m.