R/functions.R

Defines functions write_data_raw write_id write_to_file summarise_cat summarise_cts process_var merge_var filter_data count_inclu process_inclu merge_inclu merge_data compare_key_short extract_key extract_key_long extract_selection_logic extract_selection check_long extract_key_short get_setting database_full check_type check_input unzip fail_error success_msg time_to_char factor_to_char rename_cols find_common replace_key_var

Documented in check_input check_long check_type compare_key_short count_inclu database_full extract_key extract_key_long extract_key_short extract_selection extract_selection_logic factor_to_char fail_error filter_data find_common get_setting merge_data merge_inclu merge_var process_inclu process_var rename_cols replace_key_var success_msg summarise_cat summarise_cts time_to_char unzip write_data_raw write_id write_to_file

# Generic helper functions ------------------------------------------------

#' Replaces `*` in a string with digits 0 to 9
#' @param s A string.
#' @import stringr
replace_key_var <- function(s) {
  if (!str_detect(string = s, pattern = "[*]")) {
    return(s)
  } else {
    str_replace(string = s, pattern = "[*]", replacement = as.character(c(0:9)))
  }
}
#' Find common strings in all elements of a list of strings
#' @param s_list A list of strings.
find_common <- function(s_list) {
  s_all <- as.character(unlist(s_list))
  s_count <- as.data.frame(table(s_all), stringsAsFactors = FALSE)
  if (!any(s_count$Freq == length(s_list))) {
    list(all = sort(unique(s_all)), common = NULL)
  } else {
    list(all = sort(unique(s_all)),
         common = sort(s_count$s_all[s_count$Freq == length(s_list)]))
  }
}
#' Rename column names
#' @description  Rename non-key variables in a list of variable names if they
#'   appear multiple times in the list of variable names.
#' @param name_list A list of variable names.
#' @param all_list A larger list of variable names.
#' @param key_list A list of identifier names. These should not be renamed.
rename_cols <- function(name_list, all_list = name_list, key_list) {
  name_vec <- unlist(all_list)
  tb <- as.data.frame(table(name_vec))
  names_rep <- tb$name_vec[tb$Freq > 1]
  # names_rep <- setdiff(names_rep, unlist(key_list))
  lapply(1:length(name_list), function(i) {
    nm <- name_list[[i]]
    names_rep_i <- setdiff(names_rep, key_list[[i]])
    i_rep <- which(nm %in% names_rep_i)
    nm[i_rep] <- paste(nm[i_rep], i, sep = ".")
    nm
  })
}
#' Converts factor columns to character
#' @param data A data frame.
#' @details This is needed because \code{ore.pull} automatically converts
#'   character columns to factors.
factor_to_char <- function(data) {
  data[] <- lapply(data, function(x) {
    if (is.factor(x)) {
      as.character(x)
    } else {
      x
    }
  })
}
#' Convert time to character
#' @inheritParams factor_to_char
#' @details This is because \code{ore.pull} automatically add time zone of Date
#'   type columns.
time_to_char <- function(data) {
  data[] <- lapply(data, function(x) {
    if (inherits(x,c("POSIXt","POSIXct"))) {
      as.character(x)
    } else {
      x
    }
  })
}

#' Flag message for successfully loading a table
#' @param is_database Whether table is in a database.
#' @inheritParams access_bridge
success_msg <- function(table_name, database, is_database = TRUE) {
  if (is_database) {
    message(simpleMessage(
      paste("Table", table_name, "is extracted from database", database, "\n")
    ))
  } else {
    message(simpleMessage(
      paste("Table", table_name, "is loaded\n")
    ))
  }
}
#' Flag error for failing to find a table
#' @inheritParams access_bridge
#' @param is_database Whether table is in a database.
fail_error <- function(table_name, database, is_database = TRUE) {
  if (is_database) {
    stop(simpleError(
      paste("Failed to find table", table_name, "in database", database, "\n")
    ))
  } else {
    stop(simpleError(
      paste("Failed to find table", table_name, "in", database, "folder\n")
    ))
  }
}

# Check input -------------------------------------------------------------

#' Converts a comma-seperated string into a vector
#' @param string A comma-separated string.
unzip <- function(string) {
  if (is.character(string) & length(grep(string, pattern = ","))) {
    unlist(strsplit(x = string, split = ","))
  } else {
    string
  }
}
#' Check validity of input for the 3 main functions
#' @param ... Any input.
#' @details This function is needed because default values of input is
#'   overwritten to \code{""} by Rcmdr.
#' @examples {
#' RDataXMan:::check_input(NA)
#' RDataXMan:::check_input(NULL)
#' RDataXMan:::check_input(NULL, NULL)
#' RDataXMan:::check_input(NA, NULL)
#' RDataXMan:::check_input(NULL, NA)
#' RDataXMan:::check_input(NULL, "a")
#' RDataXMan:::check_input("a", NULL)
#' RDataXMan:::check_input(c("a", NULL, NA, "b", ""))
#' }
check_input <- function(...) {
  args <- list(...)
  v <- args[[1]]
  if (is.null(v)) {
    NA
  } else if (all(is.na(v))) {
    NA
  } else if (all(v == "")) {
    NA
  } else {
    i <- which(v %in% c("", " "))
    v[i] <- NA
    as.character(na.omit(v))
  }
}
#' Check data.type and table_name
#' @inheritParams genVariable
check_type <- function(table_name, data.type, database) {
  file_vec <- unlist(strsplit(table_name, split = "\\."))
  table_file <- file.path(database, table_name)
  if (file.exists(table_file)) {
    # Input is flat table.
    if (is.na(data.type)) {
      # In data.type is not specified
      data.type <- file_vec[length(file_vec)]
    } else {
      # Check if data.type is consistent
      if (data.type != tolower(file_vec[length(file_vec)])) {
        warning(simpleWarning(
          paste0("-- `data.type` specified is ", data.type,
                 ", yet data type inferred from `table_name` is",
                 tolower(file_vec[length(file_vec)]), ".",
                 " Using data type inferred from `table_name` instead.")
        ))
      }
      data.type <- tolower(file_vec[length(file_vec)])
    }
  } else {
    # Data is flat table without extension, or database
    # data.type must not be NA
    if (is.na(data.type)) {
      stop(simpleError("Please either specify `data.type`, or include file extension in `table_name` if data is stored in a flat table."))
    } else {
      if (!data.type %in% c("sql")) {
        # Data is flat table
        table_name <- paste(table_name, data.type, sep = ".")
      }
    }
    list(table_name = table_name, data.type = data.type)
  }
}

#' Make the database to a full database
#' @inheritParams genVariable
database_full <- function(research.folder, database, data.type) {
  # Check database
  if (is.na(database)) {
    warning(simpleWarning("Value for `database` is neither `public` nor `private`. Interpreted as `public` by default."))
    database <- c("public_data", "public_data")
  } else if (tolower(data.type) %in% c("sql")) {
    database <- c(database,"public_data")
  }else if (database == "private") {
    database <- c(file.path("research", research.folder, "private_data"),
                  file.path("research", research.folder, "private_data"))
  }else {
    database <- c("public_data","public_data")
  }
  database
}

# Read setting list -------------------------------------------------------

#' Extract information from inclusion setting list
#' @inheritParams genVariable
#' @param file File name of inclusion setting list.
#' @import xlsx
get_setting <- function(research.folder, file) {
  setting_list <- read.xlsx(file.path("research", research.folder,
                                      "request_input", file),
                            sheetName = "setting list", stringsAsFactors = FALSE)
  # Cannot use readxl because when there is empty cell under Sepcification,
  # readxl reads the sheet as a list instead of a table, and this upsets the
  # downstream procedure
  #setting_list <- as.data.frame(setting_list)
  setting_list[, 2] <- as.character(setting_list[, 2])
  if ("key.var" %in% setting_list[, 1]) {
    key.var <- setting_list[which(setting_list[, 1] == "key.var"), 2]
    key.desc <- unzip(setting_list[which(setting_list[, 1] == "key.desc"), 2])
  } else {
    key.var <- NULL
    key.desc <- NULL
  }
  list(
    table_name = setting_list[which(setting_list[, 1] == "table_name"), 2],
    key.var = key.var,
    key.desc = key.desc,
    identifier_var = unzip(
      setting_list[which(setting_list[, 1] == "identifier.var"), 2]
    ),
    data.type = setting_list[which(setting_list[, 1] == "data.type"), 2],
    conn_string = setting_list[which(setting_list[, 1] == "conn_string"), 2],
    database = setting_list[which(setting_list[, 1] == "database"), 2]
  )
}

# Extract key for inclusion -----------------------------------------------

#' Read in inclusion in short format
#' @inheritParams get_setting
#' @inheritParams genInclusion
#' @import readxl
#' @importFrom stringr str_detect
extract_key_short <- function(research.folder, file, key.var, key.desc = NA) {
  request <- read_excel(file.path("research", research.folder, "request_input",
                                  file),
                        sheet = "overall list")
  request <- as.data.frame(request)
  if (!(key.var %in% colnames(request))) {
    stop(simpleError(
      paste(key.var, "is not consistent in the request inclusion criteria.",
            "The first column that is not `sno` is treated as key variable.")
    ))
  }
  # Expand the incomplete key variable and return
  key <- request[, key.var]
  key <- as.vector(unlist(sapply(key, function(k) replace_key_var(k))))
  if (!is.na(key.desc)) {
    desc <- request[, key.desc]
  } else {
    desc <- NULL
  }
  list(key = key, desc = desc,
       is_expanded = any(str_detect(string = request[, key.var], pattern = "[*]")))
}
#' Validate request form in long format
#' @inheritParams genInclusion
#' @param request_file Inclusion list.
#' @param template_file Inclusion template generated by \code{genInclusion}.
#' @details \code{request_file} is annotated inclusion list, and
#'   \code{template_file} is the one generated by \code{genInclusion}. Column
#'   names and \code{key.var} should be identical in the two files.
#' @import readxl
check_long <- function(research.folder, database, request_file, template_file) {
  request <- read_excel(file.path("research", research.folder, "request_input",
                                  request_file),
                        sheet = "overall list")
  # request <- read.xlsx(file.path("research", research.folder, "request_input",
  #                                request_file),
  #                      sheetName = "overall list", stringsAsFactors = FALSE)
  request <- as.data.frame(request)
  template <- read_excel(file.path(paste0(database, "_template"), template_file),
                         sheet = "overall list")
  # template <- read.xlsx(file.path(paste0(database, "_template"), template_file),
  #                       sheetName = "overall list", stringsAsFactors=FALSE)
  template <- as.data.frame(template)
  if (!(nrow(template) == nrow(request))) {
    stop(simpleError("The rows of the template and request excel files do not match up."))
  }
  if (!("selection" %in% colnames(request) & ("selection" %in% colnames(template)))) {
    stop(simpleError("Column `selection` is required in both template and request form to indicate user selection."))
  }
  template <- template[order(template[, 1]), ]
  request <- request[order(request[, 1]), ]
  if (!all(template[, 1] == request[, 1])) {
    stop(simpleError("The first column of the template and request excel files does not match up."))
  } else {
    message(simpleMessage("  Request form is consistent with template.\n"))
  }
  request
}
#' Extract inclusion criteria based on selection symbols
#' @param selection The column for selection.
#' @param selection_symbol Symbol to indicate selection.
#' @inheritParams genInclusion
extract_selection <- function(selection, selection_symbol, key.var) {
  selection <- tolower(as.character(selection))
  if (!all(na.omit(selection) %in% selection_symbol)) {
    stop(simpleError(
      paste("Please make selection by putting", toString(selection_symbol),
            "in column `selection` of `overall list`.\n")
    ))
  }
  key.var[selection %in% selection_symbol]
}
#' @describeIn extract_selection Extract selection based on logit.
#' @param logic The column for logic.
#' @param request Data read from request file.
#' @inheritParams genInclusion
extract_selection_logic <- function(logic, request, key.var) {
  logic <- paste0(as.character(na.omit(logic)))
  if (length(logic) > 1) {
    stop(simpleError("Please write logical statement in one row."))
  } else {
    sttm <- logic
    subset(request, subset = eval(parse(text = sttm)))[, key.var]
  }
}
#' Read in inclusion in long format or variable list (in this case key.var is
#' variable)
#' @inheritParams genInclusion
#' @param file The inclusion or varible list.
extract_key_long <- function(research.folder, database, file, key.var) {
  request <- check_long(research.folder = research.folder,
                        database = database,
                        request_file = file,
                        template_file = file)
  request <- as.data.frame(request)
  if (!(key.var %in% colnames(request))) {
    stop(simpleError(
      paste(key.var, "is not consistent in the request inclusion criteria.",
            "The first column that is not `sno` is treated as key variable.")
    ))
  }
  key <- extract_selection(selection = request[, "selection"],
                           selection_symbol = "x", key.var = request[, key.var])
  if (length(key) == 0) {
    warning(simpleWarning(
      paste(
        "Using logical statement in the first entry of column `logic`",
        "in `overall list` to determine inclusion criteria.\n"
        # "No", key.var, "satisfies inclusion criteria in '", file,
        # "' selected accroding to selection column.\n"
      )
    ))
    key <- try(extract_selection_logic(
      logic = request[, "logic"], request = request, key.var = key.var
    ))
  }
  list(key = key)
}
#' Read in inclusion and variable (wrapper)
#' @inheritParams genInclusion
#' @inheritParams extract_key_long
#' @inheritParams access_bridge
extract_key <- function(research.folder, database, file, key.var, key.desc = NA,
                        type) {
  version_mode <- unlist(strsplit(file, split = ".", fixed = TRUE))
  is_short <- "short" %in% tolower(version_mode)
  if (is_short) {
    message(simpleMessage(
      if (type == "inclusion") {
        paste0(" ** Inclusion criteria '", file, "' ", "is in short version.\n")
      } else {
        paste0(" ** Variable list '", file, "' ", "is in short version.\n")
      }
    ))
    extract_key_short(research.folder = research.folder, file = file,
                      key.var = key.var, key.desc = key.desc)
  } else {
    message(simpleMessage(
      if (type == "inclusion") {
        paste0(" ** Inclusion criteria '", file, "' ", "is in long version.\n")
      } else {
        paste0(" ** Variable list '", file, "' ", "is in long version.\n")
      }
    ))
    # check template subfolder
    template_folder <- paste0(database, "_template")
    if (!dir.exists(template_folder)) {
      stop(simpleError("Please use functions `genInclusion` and `genVariable` to generate inclusion criteria and variable lists before extracting data."))
    }

    key <- extract_key_long(research.folder = research.folder,
                            database = database,
                            file = file, key.var = key.var)
    key$is_short <- FALSE
    key
  }
}

# Compare short inclusion -------------------------------------------------

#' Write a table to compare key.var and key.desc for short version inclusion
#' @param setting List of settings produced using \code{get_setting}.
#' @param key List of information on key variable produced using
#'   \code{extract_key}.
#' @param dat A data frame.
compare_key_short <- function(setting, key, dat) {
  if (is.na(setting$key.desc) & key$is_expanded) {
    if (key$is_expanded) {
      key.var_dat <- as.data.frame(unique(dat[, setting$key.var, with = FALSE]))
      key.var_req <- as.data.frame(key$key)
      names(key.var_req) <- setting$key.var
      summary_comp <- merge(key.var_dat, key.var_req, by = setting$key.var,
                            all = TRUE, sort = TRUE,
                            suffixes = c(".dat", ".req"))
      file <- paste0("compare_requirement_", setting$table_name,
                     "_", setting$key.var, "summary.xlsx")
      list(table = summary_comp, file = file)
    } else {
      NULL
    }
  } else {
    key.var_dat <- as.data.frame(unique(dat[, c(setting$key.var, setting$key.desc),
                                            with = FALSE]))
    key.var_req <- as.data.frame(cbind(key$key, key$desc))
    names(key.var_req) <- c(setting$key.var, setting$key.desc)
    summary_comp <- merge(key.var_dat, key.var_req, by = setting$key.var,
                          all = TRUE, sort = TRUE,
                          suffixes = c(".dat", ".req"))
    file <- paste0("compare_requirement_", setting$table_name,
                   "_", setting$key.var,
                   "(", paste(setting$key.desc, collapse = "_"), ")",
                   "summary.xlsx")
    list(table = summary_comp, file = file)
  }
}

# Extract and merge inclusion data ----------------------------------------

#' Merge inclusion data or variable list
#' @param data_list A list of data to merge.
#' @inheritParams extract_data
#' @import data.table
merge_data <- function(data_list, dataLogic) {
  dat_m <- as.data.frame(data_list[[1]]$dat)
  if (length(data_list) == 1) {
    return(dat_m)
  }
  colnames(dat_m) <- data_list[[1]]$cname2
  key_m <- data_list[[1]]$identifier_var
  # key_all <- key_m
  for (i in 2:length(data_list)) {
    dat_i <- as.data.frame(data_list[[i]]$dat)
    colnames(dat_i) <- data_list[[i]]$cname2
    key_i <- data_list[[i]]$identifier_var
    # key_all <- union(key_all, key_i)
    key_m <- intersect(key_m, key_i)
    dat_m <- merge(dat_m, dat_i, by = key_m, all = (dataLogic == "union"))
    # key.var of the merged data include key.var of all data that are merged
    key_m <- union(key_m, key_i)
  }
  as.data.table(dat_m)
}
#' Find the intersection or union of all inclusion, based on \code{dataLogic}
#' @param inclu_list A list of inclusion to combine.
#' @inheritParams extract_data
merge_inclu <- function(inclu_list, dataLogic) {
  if (length(inclu_list) == 1) {
    id_merged <- as.list(inclu_list[[1]]$dat[, inclu_list[[1]]$identifier_var,
                                             with = FALSE])
    id_merged <- lapply(id_merged, function(id) unique(id))
    inclu_list[[1]]$cname2 <- colnames(inclu_list[[1]]$dat)
    return(list(inclu_list = inclu_list, data_merged = inclu_list[[1]]$dat,
                id_merged = id_merged,
                id_common = inclu_list[[1]]$identifier_var,
                key_all = inclu_list[[1]]$key.var))
  }
  id <- find_common(lapply(inclu_list, function(l) l$identifier_var))
  id_all <- id$all
  if (is.null(id$common)) {
    stop(simpleError(paste(
      "There is no common identifier variable for all inclusion data.",
      "Data cannot be extracted based on such inclusion criteria."
    )))
  }
  id_common <- id$common
  # Rename variables if they exist in more than 1 inclusion tables
  # Do not rename identifier_var
  id_list <- lapply(inclu_list, function(l) l$identifier_var)
  key_list <- lapply(inclu_list, function(l) l$key.var)
  desc_list <- lapply(inclu_list, function(l) l$key.desc)
  all_list <- lapply(inclu_list, function(l) colnames(l$dat))
  key_list2 <- rename_cols(name_list = key_list, all_list = all_list,
                           key_list = id_list)
  desc_list2 <- rename_cols(name_list = desc_list, all_list = all_list,
                            key_list = id_list)
  cname_list2 <- rename_cols(name_list = all_list, all_list = all_list,
                             key_list = id_list)
  for (i in 1:length(inclu_list)) {
    inclu_list[[i]]$cname2 <- cname_list2[[i]]
    inclu_list[[i]]$key.desc2 <- desc_list2[[i]]
    inclu_list[[i]]$key.var2 <- key_list2[[i]]
  }
  inclu_m <- merge_data(data_list = inclu_list, dataLogic = dataLogic)
  id_merged <- as.list(inclu_m[, unique(id_all), with = FALSE])
  id_merged <- lapply(id_merged, unique)
  # Filter raw inclusion data
  inclu_list <- lapply(inclu_list, function(l) {
    l$dat <- filter_data(dat = l$dat, identifier_var = l$identifier_var,
                         id_merged = id_merged)
    l
  })
  list(inclu_list = inclu_list, data_merged = inclu_m,
       id_merged = id_merged, id_common = id_common,
       key_all = unlist(lapply(inclu_list, function(l) l$key.var2)))
}
#' Processing request form
#' @inheritParams extract_data
process_inclu <- function(research.folder, inclusion.xls.file, dataLogic,
                          overwrite, username=NA, password=NA){
  inclu_list <- lapply(seq_along(inclusion.xls.file), function(i) {
    setting <- get_setting(research.folder = research.folder,
                           file = inclusion.xls.file[i])
    if (basename(setting$database) == "private_data") {
      setting$db <- "private"
    } else {
      setting$db <- "public"
    }
    database <- database_full(research.folder,setting$db,setting$data.type)
    # Extract selected values of key.var and key.desc (if any)
    key <- extract_key(research.folder = research.folder,
                       database = database[2],
                       file = inclusion.xls.file[i],
                       key.var = setting$key.var,
                       key.desc = setting$key.desc, type = "inclusion")
    # Load inclusion data
    # database <- database_full(research.folder,setting$database,setting$data.type)
    # print(database)
    dat_full <- access_bridge(data.type = setting$data.type,
                              database = setting$database,
                              table_name = setting$table_name,
                              username = username,
                              password = password)$dat
    if (!any(setting$identifier_var %in% colnames(dat_full))) {
      id_vec <- setting$identifier_var[!setting$identifier_var %in%
                                         colnames(dat_full)]
      stop(simpleError(
        paste("Identifier variables", toString(id_vec),
              "are not found in table", setting$table_name)
      ))
    }
    if (!setting$key.var %in% colnames(dat_full)) {
      stop(simpleError(
        paste("Key variable", setting$key.var, "is not found in table",
              setting$table_name)
      ))
    }
    if (any(!is.na(setting$key.desc) & !setting$key.desc %in% colnames(dat_full))) {
      stop(simpleError(
        paste("Key variable description", setting$key.desc,
              "is not found in table", setting$table_name)
      ))
    }
    # Extract subset of interest
    dat <- dat_full[get(setting$key.var) %in% key$key,
                    na.omit(c(setting$identifier_var, setting$key.var,
                              setting$key.desc)),
                    with = FALSE]
    dat <- unique(dat)
    if (nrow(dat) == 0) {
      warning(simpleWarning(
        paste("No rows in", setting$table_name,
              "satisfy inclusion criteria specified.")
      ))
    }
    # If we use identifier_var as key.var, there will be duplicated columns in
    # an inclusion data. In this case we will remove duplicates.
    dat <- dat[, unique(names(dat)), with = FALSE]
    # Compare key.desc for short version
    if (key$is_short) {
      summary_comp <- compare_key_short(setting = setting, key = key, dat = dat)
      if (!is.null(summary_comp)) {
        write_to_file(table = summary_comp$table,
                      file = file.path("research", research.folder,
                                       "request_output", summary_comp$file),
                      sheetName = "Compare.request", overwrite = overwrite)
      }
    }
    list(dat = dat, file_name = inclusion.xls.file[i],
         table_name = setting$table_name,
         key.var = setting$key.var, key.desc = setting$key.desc,
         identifier_var = setting$identifier_var, index = i)
  })
  # Merge inclusion data based on dataLogic
  merge_inclu(inclu_list, dataLogic)
}

# Summarise inclusion -----------------------------------------------------

#' Count based on inclusion
#' @inheritParams merge_inclu
count_inclu <- function(inclu_list) {
  id_vec <- names(inclu_list$id_merged)
  # Count unique identifiers
  count_all <- do.call("rbind", lapply(id_vec, function(id) {
    message(simpleMessage(
      paste0(" ** Counting the total unique number of ", id, "\n")
    ))
    c(paste("Total unique", id), length(unique(inclu_list$id_merged[[id]])))
  }))
  colnames(count_all) <- c("Item", "Summary")
  # Count key variable by identifier
  if (nrow(inclu_list$data_merged) == 0) {
    warning(simpleWarning(
      " ** No data extracted with inclusion criteria specified.\n"
    ))
    count_key <- NULL
  } else {
    count_key <- lapply(inclu_list$key_all, function(key) {
      message(simpleMessage(
        paste0(" ** Counting the total unique number of ", key,
               " by each identifier variable\n")
      ))
      dat_m <- as.data.frame(
        unique(inclu_list$data_merged[, c(key, id_vec[1]), with = FALSE])
      )
      dat_m$n <- 1
      count <- aggregate(dat_m$n, by = list(dat_m[, key]), sum)
      colnames(count) <- c(key, paste("Count by", id_vec[1]))
      if (length(id_vec) > 1) {
        for (i in 2:length(id_vec)) {
          dat_m <- as.data.frame(
            unique(inclu_list$data_merged[, c(key, id_vec[i]), with = FALSE])
          )
          dat_m$n <- 1
          count_i <- aggregate(dat_m$n, by = list(dat_m[, key]), sum)
          colnames(count_i) <- c(key, paste("Count by", id_vec[i]))
          count <- merge(count, count_i, by = 1, all = TRUE)
        }
      }
      count[, -1] <- apply(as.data.frame(count[, -1]), 2, function(x) {
        x[is.na(x)] <- 0
        x
      })
      count
    })
  }
  list(count_all = count_all, count_key = count_key)
}

# Extract and merge variable list -----------------------------------------

#' Filter data based on identifier variable
#' @param dat A data.frame.
#' @param identifier_var Identifier variable.
#' @param id_merged A data.frame of merged identifiers.
filter_data <- function(dat, identifier_var, id_merged) {
  id_common <- intersect(identifier_var, names(id_merged))
  rows <- unlist(lapply(id_common, function(id) {
    which(unlist(dat[, id, with = FALSE]) %in% id_merged[[id]])
  }))
  count <- as.data.frame(table(rows), stringsAsFactors = FALSE)
  rows <- as.numeric(count$rows[count$Freq == length(id_common)])
  if (length(rows) == 0) {
    warning(simpleWarning("No data extracted with inclusion criteria specified.\n"))
  }
  unique(dat[rows, ])
}
#' Find the intersection or union of all variable lists, based on
#' \code{dataLogic}
#' @param var_list A list of variable lists.
#' @inheritParams extract_data
merge_var <- function(var_list, dataLogic) {
  if (length(var_list) == 1) {
    id_merged <- as.list(var_list[[1]]$dat[, var_list[[1]]$identifier_var,
                                           with = FALSE])
    id_merged <- lapply(id_merged, function(id) unique(id))
    var_list[[1]]$cname2 <- colnames(var_list[[1]]$dat)
    return(list(var_list = var_list, data_merged = var_list[[1]]$dat,
                id_merged = id_merged,
                id_common = var_list[[1]]$identifier_var))
  }
  id <- find_common(lapply(var_list, function(l) l$identifier_var))
  id_all <- id$all
  if (is.null(id$common)) {
    warning(simpleWarning("There is no common identifier variable for all variable lists. Data cannot be merged based on such variable lists."))
    return(list(var_list = var_list, data_merged = NULL,
                id_merged = id_merged,
                id_common = var_list[[1]]$identifier_var))
  }
  id_common <- id$common
  # Rename variables if they are exist in more than 1 inclusion tables
  # Do not rename identifier_var
  id_list <- lapply(var_list, function(l) l$identifier_var)
  all_list <- lapply(var_list, function(l) colnames(l$dat))
  cname_list2 <- rename_cols(name_list = all_list, key_list = id_list)
  for (i in 1:length(var_list)) {
    var_list[[i]]$cname2 <- cname_list2[[i]]
  }
  var_m <- merge_data(data_list = var_list, dataLogic = dataLogic)
  id_merged <- as.list(var_m[, unique(id_all), with = FALSE])
  id_merged <- lapply(id_merged, unique)
  list(var_list = var_list, data_merged = var_m,
       id_merged = id_merged, id_common = id_common)
}
#' Process variable list
#' @inheritParams extract_data
#' @inheritParams merge_inclu
process_var <- function(research.folder, variable.xls.file,
                        inclu_list, overwrite, username = NA, password = NA) {
  if (all(is.na(variable.xls.file))) {
    return(NULL)
  }
  id.no.keep <- NULL
  var_list <- lapply(seq_along(variable.xls.file), function(i) {
    setting <- get_setting(research.folder = research.folder,
                           file = variable.xls.file[i])
    if (basename(setting$database) == "private_data") {
      setting$db <- "private"
    } else {
      setting$db <- "public"
    }
    database <- database_full(research.folder, setting$db, setting$data.type)

    # Extract selected values of key.var and key.desc (if any)
    var_names <- extract_key(research.folder = research.folder,
                             database = database[2],
                             file = variable.xls.file[i],
                             key.var = "variable", type = "variable")

    # Load inclusion data
    dat_full <- access_bridge(data.type = setting$data.type,
                              database = setting$database,
                              table_name = setting$table_name,
                              username = username,
                              password = password)$dat
    if (!any(setting$identifier_var %in% colnames(dat_full))) {
      id_vec <- setting$identifier_var[!setting$identifier_var %in%
                                         colnames(dat_full)]
      stop(simpleError(
        paste("Identifier variables", toString(id_vec),
              "are not found in table", setting$table_name)
      ))
    }
    #dat <- dat_full[, var_names$key, with = FALSE]
    dat <- dat_full[, unique(c(setting$identifier_var, var_names$key)),
                    with = FALSE]
    dat <- filter_data(dat = unique(dat),
                       identifier_var = setting$identifier_var,
                       id_merged = inclu_list$id_merged)

    list(dat = dat, file_name = variable.xls.file[i],
         table_name = setting$table_name,
         identifier_var = setting$identifier_var,
         other_var = setdiff(colnames(dat), setting$identifier_var), index = i)
  })
  id.keep <- unique(unlist(lapply(seq_along(variable.xls.file), function(i){
    var_list[[i]]$other_var
  })))
  #identifier_var <- var_list[[1]]$identifier_var
  var_list <- merge_var(var_list = var_list, dataLogic = "Union")
  # Summarise each variable in the merged data
  var_m <- var_list$data_merged
  if (nrow(var_m) == 0) {
    var_summ <- NULL
  } else {
    identifier_var <- var_list[[1]]$identifier_var
    id_no_keep <- which(names(var_m) %in% identifier_var)

    var_summ <- do.call("rbind", lapply(names(var_m)[which(names(var_m) %in% id.keep)], function(var) {
      x <- as.data.frame(var_m)[, var]
      if (var %in% c(names(inclu_list$id_merged), inclu_list$key_all,
                     names(var_list$id_merged))) {
        summ <- summarise_cat(x)
        cbind(Variable = c(var, rep("", nrow(summ) - 1)), summ)
      } else {
        if (is.character(x) | is.factor(x) | is.logical(x) |
            inherits(x, "Date") | inherits(x, "POSIXct")) {
          summ <- summarise_cat(x)
          cbind(Variable = c(var, rep("", nrow(summ) - 1)), summ)
        } else {
          cbind(Variable = var, summarise_cts(x))
        }
      }
    }))
  }
  list(var_list = var_list, var_summ = var_summ)
}

# Summarise variable list -------------------------------------------------

#' Summarise a continuous variable
#' @param x Variable to summarise.
summarise_cts <- function(x) {
  data.frame(N = sum(!is.na(x)), Group = NA,
             Summary = sprintf("%.2f (%.2f)",
                               mean(x, na.rm = TRUE), sd(x, na.rm = TRUE)),
             Type = "Mean(S.D.)")
}
#' @describeIn summarise_cts Summarise a continuous variable
summarise_cat <- function(x) {
  if (all(is.na(x))) {
    data.frame(N = 0, Group = NA, Summary = sprintf("%d (%.1f%%)", NA, NA),
               Type = "")
  } else {
    count <- as.data.frame(table(x))
    count$Prop <- count$Freq / length(x)
    summ <- data.frame(Group = count$x,
                       Summary = paste0(count$Freq,
                                        " (", round(count$Prop * 100, 2), "%)"),
                       Type = "N(%)")
    summ <- cbind(N = c(length(x), rep(NA, nrow(summ) - 1)), summ)
    if (anyNA(x)) {
      rbind(
        summ,
        data.frame(N = NA, Group = NA,
                   Summary = sprintf("%d (%.1f%%)", sum(is.na(x)),
                                     sum(is.na(x)) / length(x) * 100),
                   Type = "")
      )
    } else {
      summ
    }
  }
}

# Write information to file -----------------------------------------------

#' Check whether file exists and write file
#' @param table Table to write to file.
#' @param file Excel file to write.
#' @param sheetName Name of Excel sheet to assign.
#' @param overwrite Whether to overwrite existing file.
#' @details No need to check \code{overwrite} when adding sheets to the file. In
#'   this case, set it to \code{NULL}.
#' @import xlsx
write_to_file <- function(table, file, sheetName, overwrite = NULL) {
  if (!is.null(overwrite)) {
    if (file.exists(file)) {
      msg <- paste("The file:'", file, "' already exists")
      if (overwrite) {
        msg <- paste(msg, "and will be overwritten\n")
        file.remove(file)
      } else {
        stop(simpleError(paste0(msg, ".\n")))
      }
    }
    write.xlsx(table, file = file, sheetName = sheetName,
               row.names = FALSE, showNA = FALSE)
  } else {
    write.xlsx(table, file = file, sheetName = sheetName,
               row.names = FALSE, showNA = FALSE, append = TRUE)
  }
}
#' Write the list of unique identifiers to \code{csv}
#' @param id_merged List of identifiers to write.
#' @inheritParams access_bridge
#' @inheritParams extract_data
write_id <- function(id_merged, type, research.folder) {
  # type is either "inclusion" or "variable"
  # len_max <- max(unlist(lapply(id_merged, function(id) length(id))))
  # id_merged_mat <- do.call("cbind", lapply(id_merged, function(id) {
  #   c(id, rep("", len_max - length(id)))
  # }))
  id_merged_mat <- unique(id_merged)
  write.csv(id_merged_mat,
            file = paste0("research/", research.folder,
                          "/request_output/", type, "_identifier_var.csv"),
            row.names = FALSE, na = "")
  id_merged
}
#' Write a list of raw data to a group of \code{csv}
#' @param data_list The list of data to write.
#' @inheritParams write_id
write_data_raw <- function(data_list, type, research.folder) {
  lapply(data_list, function(l) {
    dat <- l$dat
    colnames(dat) <- l$cname2
    # file = paste0("research/", research.folder,
    #               "/request_output/extract_dat_", type, "_raw_dat_",
    #               l$index, ".csv")
    file <- file.path("research", research.folder, "request_output",
                      paste0("extract_dat_", l$file_name, ".csv"))
    write.csv(dat, file = file, row.names = FALSE, na = "")
    return(dat)
  })
}
biostatUniBS/RDataXMan documentation built on Feb. 2, 2021, 9:41 a.m.