R/clean_data.R

Defines functions clean_data

Documented in clean_data

#' Clean data for the KOMODO2 workflow
#'
#' This script implements the second step of the LCFD workflow of KOMODO2.
#' It is responsible for dealing with data inconsistencies, including
#' missing values, outliers and undesired characters, as well as data
#' merging. It also preprocesses data to allow for more flexible inputs from
#' the user, such as automatically converting common annotation output to a
#' single standard format.
#'
#' The script expects enriched `KOMODO2`-type lists, which are generated by
#' [load_data()].
#'
#'
#' @param defs an enriched KOMODO2-type list object (see Details).
#'
#' @return updated \code{defs} list containing information from parsed
#' genome maps (e.g., for test and back genomes if `type == "significance"`)
#'
#' @importFrom assertthat assert_that
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Build an input list:
#' fpath1 <- system.file("extdata", "gene2GO", package="KOMODO2")
#' fpath2 <- system.file("extdata", "metadata/GO_metadata_Pan_proxy.txt", package="KOMODO2")
#' fpath3 <- system.file("extdata", "trees/tree_genome_IDs.nwk", package="KOMODO2")
#'
#' defs <- list(annotation_files_dir = fpath1,
#'              output_dir = "./results/GO_Pan_proxy/",
#'              dataset.info = fpath2,
#'              x.column = 2,
#'              ontology = "GO",
#'              dict.path = "",
#'              column = "GO",
#'              denominator.column = "",
#'              tree_path = fpath3,
#'              tree_type = "newick",
#'              linear_model_cutoff = 0.5,
#'              type = "correlation")
#'
#' defs <- load_data(defs, cores = 2)
#' defs <- clean_data(defs)
#' }

clean_data <- function(defs){
  # Second step of the workflow: dispatches to the type-specific cleaning
  # routine and, for ontology "other", converts the dictionary table into a
  # named list. Returns the updated `defs` list.

  # ================== Sanity checks ==================
  assert_that(all(c("list", "KOMODO2") %in% class(defs)))

  # `type` drives the dispatch below, so it must be a single string.
  assert_that(is.character(defs$type), length(defs$type) == 1)

  # Perform data preprocessing (the type is matched case-insensitively)
  cat("\nPreliminary data cleaning:\n")

  # The trailing unnamed argument is the default branch. Without it,
  # switch() invisibly returns NULL for an unrecognised type, which would
  # silently replace `defs` by NULL and only fail later with an unrelated
  # "argument is of length zero" error.
  defs <- switch(tolower(defs$type),
                 significance = clean_data_significance(defs),
                 correlation  = clean_data_correlation(defs),
                 stop("Unknown value for defs$type: '", defs$type,
                      "'. Expected \"significance\" or \"correlation\".",
                      call. = FALSE))


  if (defs$ontology == "other") {
    assert_that(!is.null(defs$dictionary))

    # Convert to a named list: values come from the second column and names
    # from the first column of the de-duplicated dictionary. A local variable
    # is used as scratch space so that no field of `defs` (e.g., a
    # pre-existing `defs$temp`) is clobbered in the process.
    dict             <- unique(defs$dictionary)
    dict_list        <- as.list(dict[, 2])
    names(dict_list) <- dict[, 1]
    defs$dictionary  <- dict_list
  }

  return(defs)
}
fcampelo/KOMODO2-CRAN documentation built on March 7, 2020, 6:35 a.m.