R/MergeResults.R

Defines functions preMergeResultsFiles

Documented in preMergeResultsFiles

#' Premerge results files
#'
#' @description
#' If there are many results files, starting the Shiny app may take a very long time. This function
#' already does most of the preprocessing, increasing loading speed.
#' The merged data will be stored in the same folder, and will automatically be recognized by the
#' Shiny app.
#'
#' @param dataFolder   folder where the exported results zip files are stored. Zip files containing
#'                     results from multiple databases can be placed in the same folder.
#'
#' @export
preMergeResultsFiles <- function(dataFolder) {
  zipFiles <- list.files(dataFolder, pattern = ".zip", full.names = TRUE)

  loadFile <- function(file, folder, overwrite) {
    # print(file)
    tableName <- gsub(".csv$", "", file)
    camelCaseName <- SqlRender::snakeCaseToCamelCase(tableName)
    data <- readr::read_csv(file.path(folder, file),
                            col_types = readr::cols(),
                            guess_max = 1e+07,
                            locale = readr::locale(encoding = "UTF-8"), 
                            lazy = FALSE)
    colnames(data) <- SqlRender::snakeCaseToCamelCase(colnames(data))

    if (!overwrite && exists(camelCaseName, envir = .GlobalEnv)) {
      existingData <- get(camelCaseName, envir = .GlobalEnv)
      if (nrow(existingData) > 0) {
        if (nrow(data) > 0) {
          if (all(colnames(existingData) %in% colnames(data)) && all(colnames(data) %in% colnames(existingData))) {
          data <- data[, colnames(existingData)]
          } else {
          stop("Table columns do no match previously seen columns. Columns in ",
               file,
               ":\n",
               paste(colnames(data), collapse = ", "),
               "\nPrevious columns:\n",
               paste(colnames(existingData), collapse = ", "))

          }
        }
      }
      data <- rbind(existingData, data)
    }
    assign(camelCaseName, data, envir = .GlobalEnv)

    invisible(NULL)
  }
  tableNames <- c()
  for (i in 1:length(zipFiles)) {
    writeLines(paste("Processing", zipFiles[i]))
    tempFolder <- tempfile()
    dir.create(tempFolder)
    unzip(zipFiles[i], exdir = tempFolder)

    csvFiles <- list.files(tempFolder, pattern = ".csv")
    tableNames <- c(tableNames, csvFiles)
    lapply(csvFiles, loadFile, folder = tempFolder, overwrite = (i == 1))

    unlink(tempFolder, recursive = TRUE)
  }

  tableNames <- unique(tableNames)
  tableNames <- gsub(".csv$", "", tableNames)
  tableNames <- SqlRender::snakeCaseToCamelCase(tableNames)
  save(list = tableNames, file = file.path(dataFolder, "PreMerged.RData"), compress = TRUE)
  ParallelLogger::logInfo("Merged data saved in ", file.path(dataFolder, "PreMerged.RData"))
}
ohdsi-studies/Covid19VaccineAesiIncidenceRate documentation built on Dec. 22, 2021, 4:16 a.m.