# R/clean_data.R
# In CALANGO: Comparative Analysis with Annotation-Based Genomic Components

# Clean data for the CALANGO workflow
#
# This script implements the second step of the LCFD workflow of CALANGO.
# It is responsible for dealing with data inconsistencies, including
# missing values, outliers and undesired characters, as well as data
# merging. It also preprocesses data to allow for more flexible inputs from
# the user, such as automatically converting common annotation output to a
# single standard format.
#
# The script expects enriched `CALANGO`-type lists, which are generated by
# [load_data()].
#
#
# @param defs an enriched CALANGO-type list object (see Details).
#
# @return updated \code{defs} list containing information from parsed
# genome maps (e.g., for test and back genomes if `type == "significance"`)
#
#

clean_data <- function(defs){
  # Entry point of the cleaning step: dispatches to the type-specific cleaning
  # routine and, when `defs$ontology` is "other", converts the user-supplied
  # dictionary into a named list.
  #
  # defs: a CALANGO-type list. This function reads `defs$type`,
  #       `defs$ontology` and (for ontology "other") `defs$dictionary`.
  #
  # Returns the updated `defs` list. Stops with an error if `defs$type` is
  # missing or is not one of the supported analysis types.

  # Normalise the analysis type once. The match is case-insensitive, and
  # as.character() keeps length-one factors working as before.
  analysis_type <- tolower(as.character(defs$type))
  if (length(analysis_type) != 1L || is.na(analysis_type)) {
    stop("'defs$type' must be a single value: ",
         "\"significance\" or \"correlation\".",
         call. = FALSE)
  }

  # Perform data preprocessing
  message("Preliminary data cleaning:")
  defs <- switch(analysis_type,
                 significance = clean_data_significance(defs),
                 correlation  = clean_data_correlation(defs),
                 # Without an explicit default, switch() would silently return
                 # NULL and the failure would only surface later with an
                 # unrelated error message.
                 stop("Unsupported 'defs$type': \"", analysis_type, "\". ",
                      "Expected \"significance\" or \"correlation\".",
                      call. = FALSE))


  if (defs$ontology == "other") {
    assertthat::assert_that(!is.null(defs$dictionary))

    # Convert to a named list: values come from the second column, names from
    # the first. Duplicated rows are dropped beforehand. A local variable is
    # used as scratch space so that no pre-existing field of `defs` is
    # overwritten or removed.
    dictionary_rows        <- unique(defs$dictionary)
    dictionary_list        <- as.list(dictionary_rows[, 2])
    names(dictionary_list) <- dictionary_rows[, 1]
    defs$dictionary        <- dictionary_list
  }

  return(defs)
}