R/inout_xlff_to_output.R

#' Read and wrangle excel-FastField and output dataframes or .csv/.xlsx files.
#'
#' These functions read and wrangle excel workbooks produced by the FastField
#' app and output a list of dataframes (`xlff_to_list()`), .csv files
#' (`xlff_to_csv()`) or .xlsx files (`xlff_to_xl()`). Each dataframe or file
#' combines all spreadsheets from a single excel workbook in the input
#' directory. If the input directory has multiple workbooks, the output will be
#' multiple dataframes, multiple .csv files, or multiple .excel files.
#'
#' This is a rigid function with a very specific goal: To process data from
#' FastField forms. Specifically, this is what this function does:
#' * Reads each spreadsheet from each workbook and map it to a dataframe.
#' * Lowercases and links the names of each dataframe.
#' * Adds any missing __key-sheets__:
#'     * For first census: (1) "root", (2) "multi_stems", (3) "secondary_stems",
#'     and (4) "single_stems".
#'     * For recensus: (1) "root", (2) "original_stems", (3)
#'     "new_secondary_stems", and (4) "recruits"
#' * Dates the data by `submission_id` (`date` comes from the spreadsheet
#' `root`).
#' * Lowercases and links the names of each dataframe-variable.
#' * Drops fake stems.
#' * Output a common data structure of your choice.
#'
#' @param dir_in String giving the directory containing the excel workbooks
#'   to read from.
#' @param dir_out String giving the directory where to write .csv files to.
#' @param first_census Use `TRUE` if this is your first census. Use `FALSE`
#'   (default) if this is not your first census but a recensus.
#' @param root_columns String. Lowercase name of column(s) in the root sheet (
#'   e.g. c("date", "team")). This is useful when you data has non-standard 
#'   columns.
#'
#' @return `xlff_to_csv()` and `xlff_to_xl()` write a .csv or excel (.xlsx) file
#'   per workbook -- combining all spreadsheets. `xlff_to_list` outputs a list
#'   where each dataframes combines all spreadsheeets of a workbook.
#'
#' @author Mauro Lepore and Jessica Shue.
#'
#' @section Acknowledgment:
#' * Sabrina Russo helped to make these functions useful with first censuses.
#' * David Orwig helped to fix a debug.
#'
#' @examples
#' library(fs)
#' library(readr)
#' library(readxl)
#'
#' # NOT A FIRST CENSUS
#' # Path to the folder I want to read excel files from
#' dir_in <- dirname(misc_example("two_files/new_stem_1.xlsx"))
#' dir_in
#'
#' # Files I want to read
#' dir(dir_in, pattern = "xlsx")
#'
#' # Path to the folder I want to write .csv files to
#' dir_out <- tempdir()
#'
#' # Output a csv file
#' xlff_to_csv(dir_in, dir_out)
#'
#' # Confirm
#' path_file(dir_ls(dir_out, regexp = "new_stem.*csv$"))
#'
#' # Also possible to output excel and a list of dataframe. See next section.
#'
#' # FIRST CENSUS
#' dir_in <- dirname(misc_example("first_census/census.xlsx"))
#' # As a reminder you will get a warning of missing sheets
#' # Output list of dataframes (one per input workbook -- here only one)
#' lst <- xlff_to_list(dir_in, first_census = TRUE)
#' str(lst, give.attr = FALSE)
#'
#' # Output excel
#' xlff_to_xl(dir_in, dir_out, first_census = TRUE)
#' # Read back
#' filename <- path(dir_out, "census.xlsx")
#' out <- read_excel(filename)
#' str(out, give.attr = FALSE)
#' @name xlff_to_output
NULL

xlff_to_file <- function(ext, fun_write, root_columns = NULL) {
  function(dir_in, dir_out = "./", first_census = FALSE) {
    check_dir_out(dir_out = dir_out, print_as = "`dir_out`")
    
    lst <- xlff_to_list(
      dir_in = dir_in, 
      first_census = first_census,
      root_columns = get_root_cols(root_columns)
    )
    
    files <- fs::path_ext_remove(names(lst))
    paths <- fs::path(dir_out, purrr::map(files, ~ fs::path_ext_set(.x, ext)))
    purrr::walk2(lst, paths, fun_write)
  }
}

get_root_cols <- function(x) {
  if (is.null(x)) return("date")
  c("date", tolower(x))
}

#' @export
#' @rdname xlff_to_output
xlff_to_csv <- xlff_to_file("csv", readr::write_csv)

#' @export
#' @rdname xlff_to_output
xlff_to_xl <- xlff_to_file("xlsx", writexl::write_xlsx)

#' @export
#' @rdname xlff_to_output
xlff_to_list <- function(dir_in, first_census = FALSE, root_columns = NULL) {
  check_dir_in(dir_in = dir_in, print_as = "`dir_in`")
  out <- purrr::map(
    xl_workbooks_to_chr(dir_in),
    xlff_to_list_each, 
    first_census = first_census, 
    root_columns = get_root_cols(root_columns)
  )
  rlang::set_names(out, basename(names(out)))
}

#' @noRd
xlff_to_list_each <- function(file, 
                              first_census = FALSE, 
                              root_columns) {
  dfm_list <- nms_tidy(xlsheets_list(file))
  
  if (first_census) {
    key <- key_first_census()
    dfm_list <- ensure_key_sheets(dfm_list, key)
  } else {
    key <- key_recensus()
    dfm_list <- ensure_key_sheets(dfm_list, key)
  }
  
  if (!all(root_columns %in% tolower(names(dfm_list$root)))) {
    rlang::abort("`root_columns` must be names of the root sheet.")
  }

  # Piping functions to avoid useless intermediate variables
  clean_dfm_list <- dfm_list %>%
    purrr::keep(~!purrr::is_empty(.)) %>%
    lapply(nms_tidy) %>%
    drop_fake_stems()

  # After dropping fake stems new_secondary_stems might be empty (0-row)
  purrr::walk(names(clean_dfm_list), ~warn_if_empty(clean_dfm_list, .x))

  # Sanitize
  sane <- clean_dfm_list %>%
    # Avoid error in naming cero-row dataframes
    warn_if_filling_cero_row_dataframe() %>%
    purrr::modify_if(~nrow(.x) == 0, ~purrr::map_df(.x, ~NA)) %>%
    name_dfs(name = "sheet") %>%
    # Avoid merge errors
    coerce_as_character()
  
  add_root_columns <- function(x, root_columns) {
    purrr::map_df(root_columns, ~add_root_column(x, root_column = .x))
  }
  with_date <- tibble::as_tibble(
    add_root_columns(sane, root_columns = root_columns)
  )
  
  # In columns matching "codes", replace commas by semicolon
  .df <- purrr::modify_if(
    with_date, grepl("codes", names(with_date)), ~gsub(",", ";", .x)
  )
  .df
}

#' Check that key spreadsheets exist.
#' @noRd
ensure_key_sheets <- function(x, key) {
  missing_key_sheet <- !all(names(key) %in% names(x))
  if (missing_key_sheet) {
    missing_sheets <- setdiff(names(key), names(x))
    msg <- paste0(
      "Adding missing sheets: ", commas(missing_sheets), "."
    )
    warn(msg)
    
    missing_appendix <- purrr::map(key[missing_sheets], str_df)
    x <- append(x, missing_appendix)
  }
  x
}



#' Remove rows equal to cero from the spreadsheet sheet new_secondary_stem.
#' @noRd
drop_fake_stems <- function(.df) {
  dropped <- purrr::modify_at(
    .df, .at = "new_secondary_stems", ~.x[.x$new_stem != 0, ]
  )
  dropped
}

#' Warns if a dataframe in a list of dataframes has empty rows.
#' @noRd
warn_if_empty <- function(.x, dfm_nm) {
  dfm <- .x[[dfm_nm]]

  if (is.null(dfm)) {
    warn(paste("`.x` has no dataframe", dfm_nm), ". Is this intentional?")
    return(invisible(.x))
  }

  has_cero_rows <- nrow(dfm) == 0
  if (has_cero_rows) {
    warn(paste0("`", dfm_nm, "`", " has cero rows."))
  }
  invisible(.x)
}

warn_if_filling_cero_row_dataframe <- function(lst) {
  cero_row_dfs <- purrr::keep(lst, ~nrow(.x) == 0)
  if (length(cero_row_dfs) != 0) {
    warning(
      "Filling every cero-row dataframe with NAs (",
      commas(names(cero_row_dfs)), ").",
      call. = FALSE
    )
  }
  invisible(lst)
}

coerce_as_character <- function(.x, ...) {
  purrr::map(.x, ~purrr::modify(., .f = as.character, ...))
}

add_root_column <- function(x, root_column) {
  x_ <- list_df(discard_root(x))
  y <- x[["root"]][c("submission_id", root_column)]
  if (rlang::has_name(x_, "tag")) {
    return(
      add_values_to_unique_stems(x_, y, prefix = paste0(x_$tag, "_"))
    )
  }
  add_values_to_unique_stems(x_, y, prefix = "notag_")
}

discard_root <- function(x) {
  purrr::discard(x, grepl("root", names(x)))
}

add_values_to_unique_stems <- function(x, y, prefix) {
  x_ <- dplyr::mutate(x, unique_stem = paste0(prefix, .data$stem_tag))
  dplyr::left_join(x_, y, by = "submission_id")
}

check_dir_in <- function(dir_in, print_as) {
  stopifnot(is.character(dir_in))
  validate_dir(dir_in, "`dir_in`")
  msg <- "`dir_in` must contain at least one excel file."
  file_names <- xl_workbooks_to_chr(dir_in)
  if (length(file_names) == 0) {
    abort(msg)
  }
  invisible()
}

check_dir_out <- function(dir_out, print_as) {
  stopifnot(is.character(dir_out))
  validate_dir(dir_out, "`dir_out`")
  invisible()
}

validate_dir <- function(dir, dir_name) {
  invalid_dir <- !fs::dir_exists(dir)
  if (invalid_dir) {
    msg <- paste0(
      dir_name, " must match a valid directory.\n",
      "bad ", dir_name, ": ", "'", dir, "'"
    )
    abort(msg)
  } else {
    invisible(dir)
  }
}

xl_workbooks_to_chr <- function(dir_in) {
  fs::dir_ls(dir_in, regexp = "\\.xls")
}

commas <- function(...) paste0(..., collapse = ", ")


each_list_item_is_df <- function(x) {
  if (!is.list(x) || is.data.frame(x)) {
    abort("`x` must be a list of datafraems (and not itself a dataframe).")
  }
  all(vapply(x, has_class_df, logical(1)))
}

has_class_df <- function(x) {
  any(grepl("data.frame", class(x)))
}


#' Path to directory containing example data.
#'
#' @param path Path to a file (with extension) from inst/extdata/.
#'
#' @return String (a path)
#' @export
#'
#' @examples
#' misc_example("xl")
#' dir(misc_example("xl"))
misc_example <- function(path = NULL) {
  if (is.null(path)) {
    dir(system.file("extdata", package = "fgeo.misc"))
  } else {
    system.file("extdata", path, package = "fgeo.misc", mustWork = TRUE)
  }
}

nms_tidy <- function(x) {
  if (rlang::is_named(x)) {
    names(x) <- gsub(" ", "_", tolower(names(x)))
    return(x)
  }
  
  gsub(" ", "_", tolower(x))
}
forestgeo/fgeo.misc documentation built on June 23, 2019, 6:26 p.m.