godataR: Connecting R with the Go.Data Application

Documented in get_contacts_epiwindow

#' Import contacts from Go.Data notified within a specific date range
#'
#' @author Amy Mikhail, \email{amy.mikhail@@gmail.com}
#'
#' @description
#' This function imports contacts to R from Go.Data with contact report dates
#' that fall within a specific date range.  There are three options for
#' specifying the date ranges:
#'   1. "date range": user specifies a minimum and maximum date
#'   2. "epiwindow": user specifies the last x days to return
#'   3. "epibuffer": adds the epiwindow to the minimum and maximum date
#'
#' The date that is queried is the date that cases were notified on (a mandatory
#' field in all contact investigation forms in Go.Data called dateOfReporting)
#'
#' Users can also specify whether to return "all" available columns in the case
#' data export API, or only "identifiers" (a subset of columns comprising only
#' case identifying data such as first names, last names and dates of birth).
#' Selecting "identifiers" is recommended for record linkage with another data
#' set (such as linking lab results to Go.Data contact IDs) as this smaller subset
#' of 9 columns will be returned much faster and use less memory in R for date
#' ranges that include a lot of contacts.
#'
#' @details
#' **Defining the epiwindow:**
#' The user first specifies a suitable illness episode window in days (the
#' number of days beyond which a case still producing positive samples is likely
#' to have been reinfected).  The episode window to use should be determined
#' with reference to the pathogen characteristics, as well as national and
#' international case definitions and criteria for defining reinfections. The
#' episode window is then applied to a date range, and cases are returned if
#' they are within the episode window of the minimum and maximum dates provided.
#'
#' **Defining date element orders**
#' If minimum and maximum dates are supplied to define the date range, the date
#' element order must be defined (which order is the year, month and day in).
#' For example, for a mindate of "15/08/2022" and maxdate of "30/08/2022" the
#' order is first day, then month, then year and the option to select is "dmy".
#' Options are as follows:
#'   + "ymd" select this for year first, then month, then date
#'   + "dmy" select this for day first, then month, then year
#'   + "mdy" select this for month first, then day, then year
#' Note that any separator can be used between the date elements.
#'
#' **Prerequisites:**
#' Note that this function requires Go.Data user credentials (username,
#' password and the URL or web address of the Go.Data instance).  Users must
#' have permission to export case data within Go.Data.  By default, contacts
#' will be returned for the user's active outbreak.  If the user wishes to
#' query a different outbreak, the Go.Data outbreak ID for the outbreak of
#' interest should be supplied.  To obtain the IDs of non-active outbreaks, use
#' `godataR::get_all_outbreaks()` before running this function.
#'
#' @md
#'
#' @param url URL (web address) for Go.Data instance
#' @param username User email address used to log in to Go.Data
#' @param password User password used to log in to Go.Data
#' @param outbreak Outbreak to use; "active" (default) or other outbreak ID
#' @param cols2return Set of columns to return; either "identifiers" or "all"
#' @param datequery Date query method; "date range", "epiwindow" or "epibuffer"
#' @param daterangeformat Min & max date element order; "ymd", "dmy" or "mdy"
#' @param epiwindow User-defined illness episode window in days (integer)
#' @param mindate Minimum contact reporting date for date range
#' @param maxdate Maximum contact reporting date for date range
#'
#' @return
#' Returns data.frame of case data, including Go.Data case ID
#'
#' @import lubridate
#' @import jsonlite
#' @import httr
#' @import dplyr
#' @import tidyr
#' @import purrr
#'
#' @examples
#' \dontrun{
#' # Get contacts from the active outbreak notified within the last 30 days:
#' contacts <- get_contacts_epiwindow(url = url,
#'                                    username = username,
#'                                    password = password,
#'                                    cols2return = "all",
#'                                    datequery = "epiwindow",
#'                                    epiwindow = 30)
#'
#' # Get contacts from 01 August to 25 September 2022 with a 30-day buffer:
#' contacts <- get_contacts_epiwindow(url = url,
#'                                    username = username,
#'                                    password = password,
#'                                    cols2return = "identifiers",
#'                                    datequery = "epibuffer",
#'                                    epiwindow = 30,
#'                                    daterangeformat = "dmy",
#'                                    mindate = "01/08/2022",
#'                                    maxdate = "25/09/2022")
#'
#' # View the result:
#' contacts
#' }
#' @export
get_contacts_epiwindow <- function(url,
                                   username,
                                   password,
                                   outbreak = "active",
                                   cols2return = c("identifiers",
                                                   "all"),
                                   datequery = c("date range",
                                                 "epiwindow",
                                                 "epibuffer"),
                                   daterangeformat = c("ymd",
                                                       "dmy",
                                                       "mdy"),
                                   epiwindow,
                                   mindate = NULL,
                                   maxdate = NULL){


  # Check if requisite arguments are supplied, exit with an error if not:
  if(datequery == "date range" &
     (is.null(mindate)
      | is.null(maxdate)
      | is.null(daterangeformat))){

    stop("Some arguments required to perform the date query are missing.
         For 'date range', specify mindate, maxdate and daterangeformat.")

  } else if(datequery == "epiwindow" &
            (is.null(epiwindow))){

    stop("Some arguments required to perform the date query are missing.
         For 'epiwindow', specify the epiwindow in days.")

  } else if(datequery == "epibuffer" &
            (is.null(mindate)
             | is.null(maxdate)
             | is.null(daterangeformat)
             | is.null(epiwindow))){

    stop("Some arguments required to perform the date query are missing.
         For 'epibuffer', specify mindate, maxdate, daterangeformat & epiwindow")

  }

  # Check if password needs converting from raw bytes:
  if(is.raw(password)){password = rawToChar(password)}

  ####################################
  # 01. Define date ranges:
  ####################################

  # Check that epiwindow is in the correct format:
  if(!is.null(epiwindow)){

    epiwindow = as.numeric(epiwindow)

  }

  # Define date ranges:
  if(datequery == "date range"){

    mindate = mongify_date(mindate, dateformat = daterangeformat)
    maxdate = mongify_date(maxdate, dateformat = daterangeformat)

  } else if(datequery == "epiwindow"){

    # Subtract the epiwindow from today's date to get the minimum date:
    mindate = mongify_date(Sys.Date() - lubridate::days(epiwindow))

    # Use today's date/time (right now) as the maximum date:
    maxdate = mongify_date(Sys.time())

  } else if(datequery == "epibuffer"){

    # First convert the supplied min and max dates to date format:
    mindatef = lubridate::parse_date_time(x = mindate, orders = daterangeformat)
    maxdatef = lubridate::parse_date_time(x = maxdate, orders = daterangeformat)

    # Next calculate the min and max dates applying the epiwindow buffer:
    mindate = mongify_date(mindatef - lubridate::days(epiwindow))
    maxdate = mongify_date(maxdatef + lubridate::days(epiwindow))

  }


  ####################################
  # 02. Create json query:
  ####################################

  # Build the query with or without defining columns to return:
  if(cols2return == "identifiers"){

    # Build the query as an R list object:
    query_list <- list(filter =

                         # Add where clause:
                         list(where =

                                # Filter results by date range:
                                list(dateOfReporting = list(between =
                                                              c(mindate,
                                                                maxdate)),

                                     # Define format of column names and values:
                                     useDbColumns = "true",
                                     dontTranslateValues = "true",
                                     jsonReplaceUndefinedWithNull = "true"),

                              # Define columns to return:
                              fields = c("id",                # System case ID
                                         "visualId",          # Visible case ID
                                         "firstName",         # First name
                                         "lastName",          # Last name
                                         "dob",               # Birth date
                                         "age.years",         # Age (years)
                                         "documents.number",  # Document ID
                                         "dateOfReporting",   # Report date
                                         "dateOfOnset",       # Onset date
                                         "type")))            # Type is contact

  } else if(cols2return == "all"){

    # Build the query as an R list object:
    query_list <- list(filter =

                         # Add where clause:
                         list(where =

                                # Filter results by date range:
                                list(dateOfReporting = list(between =
                                                              c(mindate,
                                                                maxdate)),

                                     # Define format of column names and values:
                                     useDbColumns = "true",
                                     dontTranslateValues = "true",
                                     jsonReplaceUndefinedWithNull = "true")))

  }


  # Convert the query to json:
  query_json <- jsonlite::toJSON(x = query_list,
                                 # Do not indent or space out elements
                                 pretty = FALSE,
                                 # Do not enclose single values in square braces
                                 auto_unbox = TRUE)


  ####################################
  # 03. Get active outbreak ID:
  ####################################

  if(outbreak == "active"){

    # Get the active outbreak ID:
    outbreak_id = get_active_outbreak(url = url,
                                      username = username,
                                      password = password)

  } else {

    # Set outbreak ID to that supplied by user:
    outbreak_id = outbreak

  }


  ####################################
  # 04. Send query to Go.Data:
  ####################################

  # Create the case export request and fetch the export log ID:
  elid = httr::POST(url =

                      # Construct request API URL:
                      paste0(url,
                             "api/outbreaks/",
                             outbreak_id,
                             "/contacts/export?access_token=",
                             get_access_token(url = url,
                                              username = username,
                                              password = password)),
                    # Set the content type:
                    httr::content_type_json(),

                    # Add query:
                    body = query_json,
                    encode = "raw") %>%

    # Fetch content:
    httr::content() %>%

    # Extract export log ID from content:
    purrr::pluck("exportLogId")


  ####################################
  # 05. Wait for download to compile:
  ####################################

  # Check status of request periodically, until finished
  er_status = httr::GET(paste0(url,
                               "api/export-logs/",
                               elid,
                               "?access_token=",
                               get_access_token(url = url,
                                                username = username,
                                                password = password))) %>%
    # Extract content:
    content()

  # Subset content to extract necessary messages:
  er_status = er_status[c("statusStep",
                          "totalNo",
                          "processedNo")]

  # Set waiting time to allow download to complete:
  while(er_status$statusStep != "LNG_STATUS_STEP_EXPORT_FINISHED") {

    # Wait for request to complete:
    Sys.sleep(2)

    # Get export request status again:
    er_status = httr::GET(paste0(url,
                                 "api/export-logs/",
                                 elid,
                                 "?access_token=",
                                 get_access_token(url = url,
                                                  username = username,
                                                  password = password))) %>%
      # Extract content again:
      content()

    # Set user progress message:
    message(paste0("...processed ",
                   er_status$processedNo,
                   " of ",
                   er_status$totalNo, " records"))

  }

  ####################################
  # 06. Fetch query results:
  ####################################

  # Now import query results to R using export log ID from the previous step:
  contacts = httr::GET(url =
                      paste0(url,
                             "api/export-logs/",
                             elid,
                             "/download?access_token=",
                             get_access_token(url = url,
                                              username = username,
                                              password = password))) %>%

    # Fetch content of downloaded file:
    httr::content("text", encoding = "UTF-8") %>%

    # Convert json to flat data.frame:
    jsonlite::fromJSON(flatten = TRUE)

  #################################
  # Tidy up output table:

  # Check that at least one record is returned, format if so:
  if(!purrr::is_empty(contacts) & is.data.frame(contacts)){

    contacts = contacts %>%

      # Replace any NULL values with NA:
      dplyr::mutate(across(.cols = everything(),
                           .fns = null2na)) %>%

      # Unnest nested variables:
      tidyr::unnest(cols = documents,
                    names_sep = "_",
                    keep_empty = TRUE) %>%

      # Convert date columns from mongodb format to R POSIXct:
      dplyr::mutate(across(.cols = c(starts_with("date"), "dob"),
                           .fns = lubridate::ymd_hms)) %>%

      # Remove language token from person type:
      dplyr::mutate(type = tolower(gsub(
        pattern = "LNG_REFERENCE_DATA_CATEGORY_PERSON_TYPE_",
        replacement = "",
        x = type))) %>%

      # Rename columns:
      dplyr::rename_with( ~ gsub(pattern = ".",
                                 replacement = "_",
                                 x = .x,
                                 fixed = TRUE))


    # Check if documents_number col is present and rename column otherwise:
    if("documents" %in% names(contacts)){

      contacts = contacts %>%

        dplyr::rename(documents_number = documents)

    }

    # List of column names in final order:
    colorder <- c("_id",
                  "visualId",
                  "firstName",
                  "lastName",
                  "dob",
                  "age_years",
                  "documents_number",
                  "dateOfReporting",
                  "type")

    # Update order of columns:
    contacts = contacts %>%

      dplyr::mutate(documents_number = as.character(documents_number)) %>%

      dplyr::relocate(all_of(colorder))


  } else {

    contacts = "no matches"

  }


  ####################################
  # 07. Return cases to match on:
  ####################################

  # Return data.frame of filtered cases:
  return(contacts)

}