# R/read_dir.R
#
# Defines functions read_dir
#
# Documented in read_dir

#' Read a directory of tsv files
#'
#' Given a directory of `tsv` files, `read_dir()` will read in each file and
#' merge them into a single matrix. Every file contributes one column, named
#' after the file without its `.tsv` extension, and rows are matched across
#' files by the values in the first column of each file. Row/file combinations
#' that are absent from the input are filled with `0`. A
#' \code{\link[Matrix]{dgCMatrix-class}} matrix will be returned if the
#' directory is named `genefamilies_relab` and a \code{\link[base]{matrix}} will
#' be returned in all other cases. In either case, the matrix will be returned
#' as a named \code{\link[base]{list}} element with the correct name to be used
#' when uploading to AWS S3 (`<date>.<study>.<data type>`, where the study name
#' is the name of the parent directory of `dir_path`).
#'
#' Only files whose names end in `.tsv` are read. Files in a directory named
#' `metaphlan_bugs_list` are expected to have four columns, of which the first
#' and third are kept; files in any other directory are expected to have two
#' columns. Text following a `#` is treated as a comment.
#'
#' @param dir_path path to a directory of `tsv` files generated by the high load
#' pipeline
#'
#' @return A single-element named \code{\link[base]{list}} (see above) with a
#' \code{\link[Matrix]{dgCMatrix-class}} matrix or a \code{\link[base]{matrix}}.
#' An error is raised if the directory contains no `.tsv` files.
#' @export
#'
#' @seealso [save_rda()]
#'
#' @examples \dontrun{
#'
#' read_dir("~/AsnicarF_2017/genefamilies_relab/")
#' }
#'
#' @importFrom magrittr %>%
#' @importFrom stringr str_replace
#' @importFrom stringr str_c
#' @importFrom purrr map
#' @importFrom purrr map2
#' @importFrom purrr set_names
#' @importFrom stringr str_remove
#' @importFrom purrr map_at
#' @importFrom purrr imap
#' @importFrom readr read_tsv
#' @importFrom purrr map_if
#' @importFrom purrr reduce
#' @importFrom dplyr full_join
#' @importFrom tibble column_to_rownames
#' @importFrom dplyr mutate
#' @importFrom dplyr across
#' @importFrom dplyr everything
#' @importFrom tidyr replace_na
#' @importFrom methods as
#' @importClassesFrom Matrix dgCMatrix
read_dir <- function(dir_path) {
    # Escaped and anchored so that only a literal, trailing ".tsv" matches; a
    # bare ".tsv" is a regular expression in which "." matches any character.
    tsv_regex <- "\\.tsv$"

    # Pipeline directory names and the data type names used in the AWS name,
    # applied in this order.
    type_renames <- base::c(
        genefamilies_relab = "gene_families",
        metaphlan_bugs_list = "relative_abundance",
        pathabundance_relab = "pathway_abundance",
        pathcoverage = "pathway_coverage"
    )

    # The study name is the directory that contains `dir_path`.
    study_name <- base::basename(base::dirname(dir_path))

    data_type <- base::basename(dir_path)
    for (old_name in base::names(type_renames)) {
        data_type <-
            stringr::str_replace(data_type, old_name, type_renames[[old_name]])
    }

    aws_name <-
        stringr::str_c(base::Sys.Date(), study_name, data_type, sep = ".")

    # Reads every tsv file in one directory and merges them into one matrix.
    read_one_dir <- function(path, type) {
        tsv_files <-
            base::dir(path = path, pattern = tsv_regex, full.names = TRUE)

        # Fail early with a clear message rather than letting the reduce step
        # below fail on an empty list.
        if (base::length(tsv_files) == 0L) {
            base::stop(
                "no .tsv files found in directory: ", path,
                call. = FALSE
            )
        }

        sample_ids <-
            stringr::str_remove(base::basename(tsv_files), tsv_regex)
        tsv_files <- purrr::set_names(tsv_files, sample_ids)

        # Relative abundance files have four columns, of which the first and
        # third are kept; all other files have two columns.
        col_types <-
            if (base::identical(type, "relative_abundance")) "c-d-" else "cd"

        read_one_file <- function(file, sample_id) {
            readr::read_tsv(
                file,
                col_names = base::c("rowname", sample_id),
                col_types = col_types,
                comment = "#",
                progress = FALSE
            )
        }

        merged <- purrr::imap(tsv_files, read_one_file)
        merged <- purrr::reduce(merged, dplyr::full_join, by = "rowname")
        merged <- tibble::column_to_rownames(merged, var = "rowname")

        # Rows absent from a file come out of the join as NA; store them as 0.
        merged <- dplyr::mutate(
            merged,
            dplyr::across(dplyr::everything(), ~ tidyr::replace_na(.x, 0))
        )
        merged <- base::as.matrix(merged)

        # Gene family tables are returned as sparse matrices.
        if (base::identical(type, "gene_families")) {
            merged <- methods::as(merged, "dgCMatrix")
        }

        merged
    }

    matrices <- purrr::map2(dir_path, data_type, read_one_dir)

    purrr::set_names(matrices, nm = aws_name)
}
# waldronlab/curatedMetagenomicDataPipeline documentation built on Oct. 15, 2021, 6:38 a.m.