R/format_odk_data.R

Defines functions combine_columns format_text_fields format_multiselect_asws extract_complex_data_from_odk_server extract_additional_data_from_odk_zip extract_data_from_odk_server extract_data_from_odk_zip format_odk_metadata

Documented in combine_columns extract_additional_data_from_odk_zip extract_complex_data_from_odk_server extract_data_from_odk_server extract_data_from_odk_zip format_multiselect_asws format_odk_metadata format_text_fields

#' Format ODK metadata
#'
#' Re-renders the ODK timestamp fields (`SubmissionDate`, `start`, `end`),
#' derives `start_time`, `end_time`, `date` and `duration` (whole minutes
#' between `start` and `end`) from them, optionally restricts the rows to a
#' date window, and removes ODK note columns (systematically empty columns
#' whose name contains `generated_note_name_`).
#'
#' @param df dataframe containing the non de-identified (raw) ODK data, assuming standard metadata fields (`SubmissionDate`, `today`, `start`, `end`) are present.
#' @param start_date start date, formatted as `%Y-%m-%d`; rows dated before it are removed (optional, by default set to `NULL`, i.e. no lower bound)
#' @param end_date end date, formatted as `%Y-%m-%d`; rows dated after it are removed (optional, by default set to `NULL`, i.e. no upper bound)
#' @return This function returns a formatted dataframe for future display and analysis, or an invisible `NULL` when `df` has no rows.
#' @import magrittr dplyr lubridate
#' @export

format_odk_metadata <- function(df,
                                start_date = NULL,
                                end_date = NULL) {

  # Nothing to format: an empty input yields an invisible NULL
  if (!(nrow(df) > 0)) {
    return(invisible(NULL))
  }

  # ODK timestamps are parsed and written back as character strings
  reformat_stamp <- function(x) {
    strftime(strptime(x = x, format = "%Y-%m-%dT%T"))
  }
  df$SubmissionDate <- reformat_stamp(df$SubmissionDate)
  df$start <- reformat_stamp(df$start)
  df$end <- reformat_stamp(df$end)

  # Time-of-day components
  df$start_time <- strftime(df$start, "%T")
  df$end_time <- strftime(df$end, "%T")

  # `today` is recomputed from `start`, then exposed under the name `date`
  df$today <- strftime(df$start, "%Y-%m-%d")
  df$duration <- floor(difftime(df$end, df$start, units = "mins"))
  df <- dplyr::rename(df, date = today)

  # Optional date window
  if (!is.null(start_date)) {
    df <- dplyr::filter(df, date >= as.Date(start_date, "%Y-%m-%d"))
  }
  if (!is.null(end_date)) {
    df <- dplyr::filter(df, date <= as.Date(end_date, "%Y-%m-%d"))
  }

  # Drop the ODK note columns, identified by the string generated_note_name_
  is_note <- grepl("generated_note_name_", colnames(df))
  df[, !is_note]

}

#' Unzip and extract ODK data from ODK zip
#'
#' The archive is extracted into `local_dir`, the CSV named `csv_name` is read
#' with every column typed as character, and the result is passed to
#' `format_odk_metadata()`.
#'
#' @param odk_zip absolute path to the zip file named "`fid`.zip" containing ODK submissions as CSV, plus separate CSVs for any repeating groups, plus any attachments in a subfolder `media`
#' @param csv_name name of the .CSV file
#' @param start_date start date (optional, by default set to `NULL`)
#' @param end_date end date (optional, by default set to `NULL`)
#' @param local_dir local directory (optional, by default set to `tempdir()`)
#' @param col_specs column specifications (optional); currently not used, all columns are read as character
#' @return This function returns a formatted dataframe for future display and analysis.
#' @import readr utils fs
#' @export

extract_data_from_odk_zip <- function(odk_zip, csv_name,
                                      start_date = NULL,
                                      end_date = NULL,
                                      local_dir = tempdir(),
                                      col_specs = NULL) {

  utils::unzip(odk_zip, exdir = local_dir)

  # Every column is read as character: format_odk_metadata() parses the
  # timestamp fields itself from their raw text representation.
  raw_odk_data <- readr::read_csv(file.path(local_dir, csv_name),
                                  col_types = readr::cols(.default = "c"))

  timci::format_odk_metadata(raw_odk_data,
                             start_date,
                             end_date)

}

#' Extract data from ODK server to a formatted dataframe.
#' This function relies on ruODK to export submissions (without media attachments).
#'
#' @param cpid integer, ODK project ID
#' @param cpid_forms list of form IDs in `cpid`
#' @param cfid string, ODK form ID
#' @param cpp string, ODK project passphrase (optional, only required if the ODK project is encrypted)
#' @param start_date date, data collection start date (optional, by default set to `NULL`)
#' @param end_date date, data collection end date (optional, by default set to `NULL`)
#' @param filter OData filter (optional, by default set to `NULL`)
#' @param col_specs column specifications (optional, by default set to `NULL`)
#' @param group keep group names (optional, by default set to `TRUE`)
#' @param split split "select multiple" choices into columns (optional, by default set to `FALSE`)
#' @param verbose boolean, displays more information about the function output (written to the standard error stream)
#' @return This function returns a formatted dataframe for future display and analysis, or `NULL` when `cfid` is not one of `cpid_forms`.
#' @import ruODK
#' @export

extract_data_from_odk_server <- function(cpid,
                                         cpid_forms,
                                         cfid,
                                         cpp="",
                                         start_date = NULL,
                                         end_date = NULL,
                                         filter = NULL,
                                         col_specs = NULL,
                                         group = TRUE,
                                         split = FALSE,
                                         verbose = FALSE) {

  df <- NULL
  cdir <- tempdir()

  # Only query the server for forms that belong to the project
  if (cfid %in% cpid_forms) {
    odk_zip <- ruODK::submission_export(local_dir = cdir,
                                        pid = cpid,
                                        fid = cfid,
                                        pp = cpp,
                                        filter = filter,
                                        delfields = FALSE,
                                        group = group,
                                        split = split,
                                        media = FALSE)

    df <- timci::extract_data_from_odk_zip(odk_zip = odk_zip,
                                           csv_name = paste0(cfid, ".csv"),
                                           start_date = start_date,
                                           end_date = end_date,
                                           local_dir = cdir,
                                           col_specs = col_specs)
  }

  if (verbose == TRUE) {
    if (!is.null(df)) {
      write(paste0("Data successfully downloaded: ", nrow(df), " row(s)."), stderr())
    } else {
      write("Data could not be downloaded.", stderr())
    }
  }

  df

}

#' Unzip and extract additional data from ODK zip
#'
#' The archive is extracted into `local_dir` and the CSV named `csv_name` is
#' read from there, with column types guessed by readr.
#'
#' @param odk_zip absolute path to the zip file named "`fid`.zip" containing ODK submissions as CSV, plus separate CSVs for any repeating groups, plus any attachments in a subfolder `media`
#' @param csv_name name of the .CSV file
#' @param local_dir local directory (optional, by default set to `tempdir()`)
#' @return This function returns the content of the CSV file as a dataframe, or `NULL` if `csv_name` does not exist in `local_dir` after extraction.
#' @import readr utils fs
#' @export

extract_additional_data_from_odk_zip <- function(odk_zip,
                                                 csv_name,
                                                 local_dir = tempdir()) {

  utils::unzip(odk_zip, exdir = local_dir)

  fn <- file.path(local_dir, csv_name)
  if (!file.exists(fn)) {
    return(NULL)
  }

  # Column types are guessed from (at most) the first 2000 rows
  readr::read_csv(fn,
                  guess_max = 2000)

}

#' Extract complex data from ODK server to a list of dataframes.
#' This function relies on ruODK to export submissions (including media attachments).
#'
#' The first element of the returned list is the formatted main submission
#' table (`cfid`.csv). It is followed by one dataframe for every other CSV file
#' found in the temporary directory whose name matches `cfid`*.csv.
#'
#' @param cpid integer, ODK project ID
#' @param cpid_forms list of form IDs in `cpid`
#' @param cfid string, ODK form ID
#' @param cpp string, ODK project passphrase (optional, only required if the ODK project is encrypted)
#' @param start_date date, data collection start date (optional, by default set to `NULL`)
#' @param end_date date, data collection end date (optional, by default set to `NULL`)
#' @param filter OData filter (optional, by default set to `NULL`)
#' @param col_specs column specifications (optional, by default set to `NULL`)
#' @param group keep group names (optional, by default set to `TRUE`)
#' @param split split "select multiple" choices into columns (optional, by default set to `FALSE`)
#' @param verbose boolean, displays more information about the function output (written to the standard error stream)
#' @return This function returns a list of dataframes, or `NULL` when `cfid` is not one of `cpid_forms`.
#' @import ruODK
#' @export

extract_complex_data_from_odk_server <- function(cpid,
                                                 cpid_forms,
                                                 cfid,
                                                 cpp="",
                                                 start_date = NULL,
                                                 end_date = NULL,
                                                 filter = NULL,
                                                 col_specs = NULL,
                                                 group = TRUE,
                                                 split = FALSE,
                                                 verbose = FALSE) {

  out <- NULL
  # Must be initialised here: without it, `df` resolves to the function
  # stats::df() when the form is not found, and the verbose report below
  # wrongly announces a successful download.
  df <- NULL
  cdir <- tempdir()

  if (cfid %in% cpid_forms) {
    odk_zip <- ruODK::submission_export(local_dir = cdir,
                                        pid = cpid,
                                        fid = cfid,
                                        pp = cpp,
                                        filter = filter,
                                        delfields = FALSE,
                                        group = group,
                                        split = split,
                                        media = TRUE)

    # Extract ODK submissions
    main_csv <- paste0(cfid, ".csv")
    df <- timci::extract_data_from_odk_zip(odk_zip = odk_zip,
                                           csv_name = main_csv,
                                           start_date = start_date,
                                           end_date = end_date,
                                           local_dir = cdir,
                                           col_specs = col_specs)

    # Extract and append other CSV attachments
    files <- list.files(path = cdir,
                        pattern = utils::glob2rx(paste0(cfid, "*.csv")))
    extra_files <- files[files != main_csv]
    extras <- lapply(extra_files, function(f) {
      if (verbose == TRUE) {
        write(f, stderr())
      }
      timci::extract_additional_data_from_odk_zip(odk_zip = odk_zip,
                                                  csv_name = f,
                                                  local_dir = cdir)
    })
    out <- c(list(df), extras)

  }

  if (verbose == TRUE) {
    if (!is.null(df)) {
      write(paste0("Data successfully downloaded: ", nrow(df), " row(s)."), stderr())
    } else {
      write("Data could not be downloaded.", stderr())
    }
  }
  out

}

#' Format multiple select answers so as to separate them
#'
#' In each listed column that exists in `df`, every space is replaced by `sep`.
#'
#' @param df dataframe containing ODK data
#' @param cols list of column names; names that are not columns of `df` are ignored
#' @param sep separator, e.g. ";" "," etc
#' @return This function returns a dataframe with multiple answers separated by `sep`.
#' @import stringr
#' @export

format_multiselect_asws <- function(df, cols, sep) {

  available <- colnames(df)

  for (field in cols) {
    # Skip requested columns that the dataframe does not contain
    if (!(field %in% available)) {
      next
    }
    # Answers of a multiple select question are space-separated in the raw data
    df[[field]] <- stringr::str_replace_all(df[[field]], " ", sep)
  }

  df

}

#' Sanitise text fields for CSV export
#'
#' In each listed column that exists in `df`, commas and semicolons are
#' replaced by hyphens, line breaks are replaced by spaces, and double quotes
#' are removed.
#'
#' @param df dataframe containing ODK data
#' @param cols list of column names; names that are not columns of `df` are ignored
#' @return This function returns a dataframe in which the listed text columns have been sanitised.
#' @import stringr
#' @export

format_text_fields <- function(df, cols) {

  available <- colnames(df)

  # (pattern, replacement) pairs, applied in this order to every listed column
  rules <- list(
    c(",", "-"),
    c(";", "-"),
    c("\n", " "),
    c('"', "")
  )

  for (field in cols) {
    # Skip requested columns that the dataframe does not contain
    if (!(field %in% available)) {
      next
    }
    cleaned <- df[[field]]
    for (rule in rules) {
      cleaned <- stringr::str_replace_all(cleaned, rule[1], rule[2])
    }
    df[[field]] <- cleaned
  }

  df

}

#' Combine 2 columns - function in progress (to be tested)
#'
#' For each pair (`cols1[i]`, `cols2[i]`) whose two columns both exist in `df`,
#' the missing (`NA`) values of the reference column are filled with the values
#' found at the same rows of the other column. Pairs for which either column is
#' absent from `df` are skipped.
#'
#' @param df Dataframe containing the non de-identified (raw) ODK data collected at the facility level
#' @param cols1 Vector of reference column names
#' @param cols2 Vector of column names to be merged with the reference columns
#' @return This function returns a formatted dataframe for future display and analysis.
#' @export

combine_columns <- function(df, cols1, cols2) {

  dfcols <- colnames(df)

  # seq_along() (rather than 1:length()) so that an empty `cols1` is a no-op
  for (i in seq_along(cols1)) {
    c1 <- cols1[i]
    c2 <- cols2[i]
    if (isTRUE(c1 %in% dfcols) && isTRUE(c2 %in% dfcols)) {
      # Fill the gaps in place instead of rebuilding the column with ifelse(),
      # which strips attributes (e.g. turns Date columns into numbers).
      gaps <- is.na(df[[c1]])
      if (any(gaps)) {
        df[[c1]][gaps] <- df[[c2]][gaps]
      }
    }
  }
  df

}
Thaliehln/timci documentation built on April 8, 2024, 3:38 p.m.