R/getQualtrics.R

Defines functions parse_dates_to_iso qualtrics.dict qualtrics.index qualtricsHarmonization connectQualtrics qualtrics

Documented in qualtrics qualtrics.dict qualtrics.index

#' Retrieve Survey Data from Qualtrics
#'
#' @param qualtrics_alias The alias for the Qualtrics survey to be retrieved.
#' @param institution Optional. The institution name (e.g., "temple" or "nu"). If NULL, all institutions will be searched.
#' @param label Logical indicating whether to return coded values or their associated labels (default is FALSE).
#' @param interview_date Optional; can be either:
#'        - A date string in various formats (ISO, US, etc.) to filter data up to that date
#'        - A boolean TRUE to return only rows with non-NA interview_date values
#' @return A cleaned and harmonized data frame containing the survey data with superkeys first.
#' @importFrom dplyr %>% select mutate
#' @export
#' @examples
#' \dontrun{
#' # Get survey by alias (will search all institutions)
#' survey_data <- qualtrics("rgpts")
#' }
qualtrics <- function(qualtrics_alias, institution = NULL, label = FALSE, interview_date = NULL) {
  # Load necessary source files
  
  # Validate config
  cfg <- validate_config("qualtrics")
  
  # Get secrets using get_secret() to keep it secret, keep it safe
  baseUrls <- get_secret("baseUrls")
  apiKeys <- get_secret("apiKeys")
  
  # Get survey ID
  survey_id <- NULL
  
  if (!is.null(institution)) {
    # Check if institution exists
    if (!(institution %in% names(cfg$qualtrics$survey_ids))) {
      stop(paste("Institution", institution, "not found in ./config.yml configuration."))
    }
    
    # Check if survey exists in specified institution
    if (!(qualtrics_alias %in% names(cfg$qualtrics$survey_ids[[institution]]))) {
      stop(paste("Survey", qualtrics_alias, "not found in institution", institution))
    }
    
    survey_id <- cfg$qualtrics$survey_ids[[institution]][[qualtrics_alias]]
  } else {
    # Search all institutions
    found <- FALSE
    for (inst in names(cfg$qualtrics$survey_ids)) {
      if (qualtrics_alias %in% names(cfg$qualtrics$survey_ids[[inst]])) {
        survey_id <- cfg$qualtrics$survey_ids[[inst]][[qualtrics_alias]]
        institution <- inst
        found <- TRUE
        break
      }
    }
    
    if (!found) {
      stop(sprintf("Qualtrics survey '%s' not found in any institution.", qualtrics_alias))
    }
  }
  
  message(sprintf("Retrieving '%s' survey from %s Qualtrics...", qualtrics_alias, institution))
  
  # Connect to Qualtrics
  connectQualtrics()
  
  # Show loading animation (if implemented)
  if (exists("show_loading_animation")) {
    show_loading_animation()
  }
  
  # Fetch the data
  df <- qualtRics::fetch_survey(
    surveyID = survey_id,
    verbose = FALSE,
    label = label,
    convert = label,
    add_column_map = TRUE
  )
  
  if (!is.data.frame(df)) {
    stop(paste("fetch_survey did not return a data frame for", qualtrics_alias))
  }
  
  # Get identifier from config
  identifier <- cfg$identifier
  
  # Create a copy of the original dataframe to preserve original values
  original_df <- df
  
  # Advanced date parsing function that handles multiple formats
  parseAnyDate <- function(date_string) {
    if (is.na(date_string) || is.null(date_string)) {
      return(NA)
    }
    
    # Try multiple date formats sequentially
    date <- NULL
    
    # Try ISO format (YYYY-MM-DD)
    if (grepl("^\\d{4}-\\d{1,2}-\\d{1,2}$", date_string)) {
      date <- tryCatch(ymd(date_string), error = function(e) NULL)
    } 
    # Try US format (MM/DD/YYYY)
    else if (grepl("^\\d{1,2}/\\d{1,2}/\\d{4}$", date_string)) {
      date <- tryCatch(mdy(date_string), error = function(e) NULL)
    } 
    # Try European format (DD.MM.YYYY)
    else if (grepl("^\\d{1,2}\\.\\d{1,2}\\.\\d{4}$", date_string)) {
      date <- tryCatch(dmy(date_string), error = function(e) NULL)
    }
    # Try Canadian format (YYYY/MM/DD)
    else if (grepl("^\\d{4}/\\d{1,2}/\\d{1,2}$", date_string)) {
      date <- tryCatch(ymd(date_string), error = function(e) NULL)
    }
    # Try other format (DD-MM-YYYY)
    else if (grepl("^\\d{1,2}-\\d{1,2}-\\d{4}$", date_string)) {
      date <- tryCatch(dmy(date_string), error = function(e) NULL)
    }
    # Try abbreviated month name (15-Jan-2023 or Jan 15, 2023)
    else if (grepl("[A-Za-z]", date_string)) {
      date <- tryCatch(parse_date_time(date_string, c("dmy", "mdy")), error = function(e) NULL)
    }
    
    # If all attempts fail, return NA
    if (is.null(date) || all(is.na(date))) {
      warning("Failed to parse date: ", date_string, ". Treating as NA.")
      return(NA)
    }
    
    return(as.Date(date))
  }
  
  # Handle interview_date filtering
  if ("interview_date" %in% names(df)) {
    # Create a temporary date column for filtering but don't modify the original
    df$temp_date <- sapply(df$interview_date, parseAnyDate)
    
    # Handle the interview_date parameter
    if (!is.null(interview_date)) {
      if (is.logical(interview_date) && interview_date == TRUE) {
        # Keep only rows with non-NA interview_date values
        rows_to_keep <- !is.na(df$temp_date)
        df <- df[rows_to_keep, ]
        original_df <- original_df[rows_to_keep, ]
      } else if (is.character(interview_date) || inherits(interview_date, "Date")) {
        # Filter by specific date
        input_date <- tryCatch({
          if (inherits(interview_date, "Date")) {
            interview_date
          } else {
            parseAnyDate(interview_date)
          }
        }, error = function(e) {
          stop("Failed to parse interview_date parameter: ", interview_date)
        })
        
        if (is.na(input_date)) {
          stop("Failed to parse interview_date parameter: ", interview_date)
        }
        
        rows_to_keep <- df$temp_date <= input_date
        df <- df[rows_to_keep, ]
        original_df <- original_df[rows_to_keep, ]
      } else {
        stop("interview_date must be either a date string or TRUE")
      }
    }
    
    # Remove the temporary date column
    df$temp_date <- NULL
  }
  
  # Harmonize the data
  clean_df <- qualtricsHarmonization(df, identifier, qualtrics_alias)
  
  # List of allowed superkey columns to prioritize
  allowed_superkey_cols <- c(
    "record_id",
    "src_subject_id",
    "subjectkey",
    "site",
    "subsiteid",
    "sex",
    "race",
    "ethnic_group",
    "phenotype",
    "phenotype_description",
    "state",
    "status",
    "lost_to_followup",
    "lost_to_follow-up",
    "twins_study",
    "sibling_study",
    "family_study",
    "sample_taken",
    "interview_date",
    "interview_age",
    "visit",
    "week"
  )
  
  # Reorder columns to have superkeys first
  if (is.data.frame(clean_df) && ncol(clean_df) > 0) {
    # Identify which superkey columns are actually in the data
    present_superkeys <- intersect(allowed_superkey_cols, names(clean_df))
    
    # Get all other columns (non-superkeys)
    other_cols <- setdiff(names(clean_df), present_superkeys)
    
    # If there are matching superkeys, reorder the columns
    if (length(present_superkeys) > 0) {
      # Create new column order with superkeys first, then other columns
      new_order <- c(present_superkeys, other_cols)
      
      # Reorder the dataframe
      clean_df <- clean_df[, new_order, drop = FALSE]
    }
  }
  
  return(clean_df)
}

#################
## Helper Functions
#################

#' Connect to Qualtrics API
#'
#' This helper function sets up the connection to the Qualtrics API using credentials stored in a file or environment variables.
#' It is called internally by the 'qualtrics' function.
#'
#' @importFrom config get
#' @import qualtRics
#' @noRd
connectQualtrics <- function() {
  # Validate secrets
  validate_secrets("qualtrics")
  
  # Get secrets using get_secret() to keep it secret, keep it safe
  baseUrls <- get_secret("baseUrls")
  apiKeys <- get_secret("apiKeys")
  
  
  if (!exists("apiKeys") || !exists("baseUrls")) {
    stop("apiKeys and/or baseUrls arrays not found in secrets.R")
  }
  
  if (length(apiKeys) != length(baseUrls)) {
    stop("apiKeys and baseUrls arrays must have the same length.")
  }
  
  # Suppress messages about .Renviron
  suppressMessages({
    for (i in seq_along(apiKeys)) {
      tryCatch({
        # Set credentials and load environment manually to avoid restart message
        result <- qualtRics::qualtrics_api_credentials(
          api_key = apiKeys[i],
          base_url = baseUrls[i],
          install = TRUE,
          overwrite = TRUE
        )
        
        # If credentials were set successfully, also read them into current session
        if (file.exists("~/.Renviron")) {
          readRenviron("~/.Renviron")
        }
        
        return(TRUE)
      }, error = function(e) {
        if (i == length(apiKeys)) {
          stop("Failed to connect with any credentials provided in ./secrets.R")
        }
      })
    }
  })
}

#' Harmonize Data
#'
#' Performs data cleaning and harmonization on the fetched Qualtrics survey data.
#'
#' @param df Data frame containing Qualtrics survey data.
#' @param identifier The unique identifier for survey respondents.
#' @param qualtrics_alias The alias for the Qualtrics survey.
#' @return Harmonized data frame.
#' @importFrom dplyr mutate
#' @noRd
qualtricsHarmonization <- function(df, identifier, qualtrics_alias) {
  if (!is.data.frame(df)) {
    stop("Input to qualtricsHarmonization is not a data frame.")
  }
  
  # Validate config
  cfg <- validate_config("qualtrics")
  
  # Check for visit variable, if not add baseline
  `%!in%` <- Negate(`%in%`)
  if ("visit" %!in% colnames(df) && cfg$study_alias == 'capr') {
    df$visit <- "bl"
  }
  
  # If visit variable exists, standardize values
  if ("visit" %in% colnames(df) && cfg$study_alias == 'capr') {
    df$visit <- ifelse(is.na(df$visit), "bl", 
                       ifelse(df$visit == "0", "bl", 
                              ifelse(df$visit == "12", "12m", 
                                     ifelse(df$visit == "24", "24m", df$visit))))
  }
  
  # Additional processing can be uncommented and modified as needed
  # df$src_subject_id <- as.numeric(df$src_subject_id)
  # df$interview_date <- as.Date(df$interview_date, "%m/%d/%Y")
  # df$measure <- qualtrics_alias
  
  # convert dates (from string ("m/d/Y") to iso date format)
  if ("interview_date" %in% colnames(df)) {
    df$interview_date <- parse_dates_to_iso(df$interview_date, "interview_date")
  }
  
  suppressWarnings(return(df))
}

#' Display table of available Qualtrics surveys 
#'
#' Retrieves a list of all available surveys in the configured Qualtrics account.
#'
#' @param institution Optional; the institution identifier to use. If NULL, uses all
#'   institutions specified in the configuration file.
#'
#' @return A data frame containing the IDs and names of all available surveys
#'   in the configured Qualtrics account. Can be used to identify surveys for
#'   further data retrieval.
#'   
#' @export
qualtrics.index <- function(institution = NULL) {
  # Temporarily suppress warnings
  old_warn <- options("warn")

  tryCatch({
    # Load necessary source files for helper functions
    
    # Load required secrets and configuration
    validate_secrets("qualtrics")
    
    # Get secrets using get_secret() to keep it secret, keep it safe
    baseUrls <- get_secret("baseUrls")
    apiKeys <- get_secret("apiKeys")
    
    cfg <- validate_config("qualtrics")
    
    # Connect to Qualtrics using the existing helper function
    connectQualtrics()
    
    # Get all surveys
    message("Fetching available Qualtrics surveys...")
    surveys <- qualtRics::all_surveys()
    
    # Filter by institution if specified
    if (!is.null(institution) && !is.null(cfg$qualtrics$survey_ids)) {
      if (institution %in% names(cfg$qualtrics$survey_ids)) {
        # Extract the survey IDs for the specified institution
        inst_surveys <- cfg$qualtrics$survey_ids[[institution]]
        
        # Create a mapping of configured surveys for this institution
        configured_surveys <- data.frame(
          id = unlist(inst_surveys),
          alias = names(inst_surveys),
          stringsAsFactors = FALSE
        )
        
        # Filter and merge with alias information
        surveys <- merge(surveys, configured_surveys, by = "id", all.y = TRUE)
        message(paste0("Filtered to ", nrow(surveys), " surveys from institution '", institution, "'"))
      } else {
        warning(paste0("Institution '", institution, "' not found in configuration. Returning all surveys."))
      }
    } else if (!is.null(cfg$qualtrics$survey_ids)) {
      # Create a complete mapping of all configured surveys across institutions
      all_mapped_surveys <- data.frame(
        id = character(0),
        alias = character(0),
        institution = character(0),
        stringsAsFactors = FALSE
      )
      
      for (inst in names(cfg$qualtrics$survey_ids)) {
        inst_surveys <- cfg$qualtrics$survey_ids[[inst]]
        if (length(inst_surveys) > 0) {
          inst_df <- data.frame(
            id = unlist(inst_surveys),
            alias = names(inst_surveys),
            institution = rep(inst, length(inst_surveys)),
            stringsAsFactors = FALSE
          )
          all_mapped_surveys <- rbind(all_mapped_surveys, inst_df)
        }
      }
      
      # Merge with the surveys data
      if (nrow(all_mapped_surveys) > 0) {
        surveys <- merge(surveys, all_mapped_surveys, by = "id", all = TRUE)
      }
    }
    
    # Format the output
    if (nrow(surveys) > 0) {
      # Sort by name for easier reading
      surveys <- surveys[order(surveys$name), ]
      
      # Create output table
      if ("alias" %in% colnames(surveys)) {
        if ("institution" %in% colnames(surveys)) {
          # Full output with institution information
          result <- data.frame(
            ID = surveys$id,
            Alias = surveys$alias,
            Institution = surveys$institution,
            Name = surveys$name,
            Last_Modified = surveys$lastModified,
            stringsAsFactors = FALSE
          )
        } else {
          # Output for specific institution
          result <- data.frame(
            ID = surveys$id,
            Alias = surveys$alias,
            Name = surveys$name,
            Last_Modified = surveys$lastModified,
            stringsAsFactors = FALSE
          )
        }
      } else {
        # Basic output for unconfigured surveys
        result <- data.frame(
          ID = surveys$id,
          Name = surveys$name,
          Last_Modified = surveys$lastModified,
          stringsAsFactors = FALSE
        )
      }
      
      # Print the results in a user-friendly format
      message(paste0("\nFound ", nrow(result), " Qualtrics surveys:"))
      print(result, row.names = TRUE)
      
      # Restore previous warning setting
      options(old_warn)
      
      return(invisible(surveys))
    } else {
      message("No surveys found.")
      
      # Restore previous warning setting
      options(old_warn)
      
      return(invisible(NULL))
    }
  }, error = function(e) {
    # Restore previous warning setting before stopping
    options(old_warn)
    
    stop(paste("Error connecting to Qualtrics:", e$message))
  })
}

#' Fetch Qualtrics survey metadata to be stored in data frame
#'
#' This function extracts column mappings from the metadata of a Qualtrics survey data frame.
#' It can accept either a data frame containing Qualtrics data, a variable name as string,
#' or a survey alias string.
#'
#' @param survey_alias Can either be an existing dataframe, variable name as string, or survey alias string
#' @param exclude_embedded Only select QIDs
#' @return A list containing the mappings of column names to survey questions.
#' @export
qualtrics.dict <- function(survey_alias, exclude_embedded = TRUE) {
  # First handle the case of a non-existent variable being passed without quotes
  var_name <- NULL
  
  # Only try to get the name if survey_alias is missing
  if (missing(survey_alias)) {
    stop("Survey alias is required")
  }
  
  # Capture the actual call
  call_expr <- substitute(survey_alias)
  
  # Check if it's a symbol (variable name) that doesn't exist
  if (is.symbol(call_expr) && !exists(as.character(call_expr))) {
    var_name <- as.character(call_expr)
    message(sprintf("Object '%s' not found, using as survey alias instead.", var_name))
    survey_alias <- var_name
  }
  
  # Now proceed with normal function logic
  
  # Check if input is a data frame
  if (is.data.frame(survey_alias)) {
    # Input is already a data frame, use it directly
    colmap <- qualtRics::extract_colmap(respdata = survey_alias)
    
    # Filter to include only QID fields
    if (exclude_embedded && !is.null(colmap)) {
      if ("ImportId" %in% names(colmap)) {
        colmap <- colmap[!is.na(colmap$ImportId) & grepl("^QID", colmap$ImportId), ]
      } else if ("qid" %in% names(colmap)) {
        colmap <- colmap[!is.na(colmap$qid) & grepl("^QID", colmap$qid), ]
      }
    }
    
    return(colmap)
  }
  
  # Input is a string
  if (is.character(survey_alias)) {
    # First, check if it's a variable name in the global environment
    if (exists(survey_alias)) {
      var_data <- base::get(survey_alias)
      
      # Check if the variable is a data frame
      if (is.data.frame(var_data)) {
        message(sprintf("Using existing data frame '%s' from environment.", survey_alias))
        colmap <- qualtRics::extract_colmap(respdata = var_data)
        
        # Filter to include only QID fields
        if (exclude_embedded && !is.null(colmap)) {
          if ("ImportId" %in% names(colmap)) {
            colmap <- colmap[!is.na(colmap$ImportId) & grepl("^QID", colmap$ImportId), ]
          } else if ("qid" %in% names(colmap)) {
            colmap <- colmap[!is.na(colmap$qid) & grepl("^QID", colmap$qid), ]
          }
        }
        
        return(colmap)
      }
    }
    
    # Not a variable or not a data frame, treat as survey alias
    message(sprintf("Fetching dictionary for alias '%s' from Qualtrics.", survey_alias))
    
    # Temporarily suppress warnings and disable progress bars
    old_warn <- options("warn")
    old_opt <- options(qualtRics.progress = FALSE)
    on.exit({options(old_warn); options(old_opt)}, add = TRUE)

    # Get survey data with suppressed output
    survey_data <- suppressMessages(wizaRdry::qualtrics(survey_alias))
    colmap <- qualtRics::extract_colmap(respdata = survey_data)
    
    # Filter to include only QID fields
    if (exclude_embedded && !is.null(colmap)) {
      if ("ImportId" %in% names(colmap)) {
        colmap <- colmap[!is.na(colmap$ImportId) & grepl("^QID", colmap$ImportId), ]
      } else if ("qid" %in% names(colmap)) {
        colmap <- colmap[!is.na(colmap$qid) & grepl("^QID", colmap$qid), ]
      }
    }
    
    return(colmap)
  }
  
  # Invalid input type
  stop("Input must be either a data frame or a string (survey alias or variable name).")
}

#' Convert dates to ISO format robustly
#'
#' This function attempts to intelligently parse dates in various formats
#' and convert them to ISO format (YYYY-MM-DD).
#'
#' @param date_vector A vector of date strings to be parsed
#' @param column_name The name of the column being parsed (for error messages)
#' @return A Date vector in ISO format (YYYY-MM-DD)
#' @importFrom lubridate parse_date_time
#' @noRd
parse_dates_to_iso <- function(date_vector, column_name = "date") {
  if (is.null(date_vector) || length(date_vector) == 0) {
    return(date_vector)
  }
  
  # Skip if already in Date format
  if (inherits(date_vector, "Date")) {
    return(date_vector)
  }
  
  # If already a POSIXct or POSIXlt, convert to Date
  if (inherits(date_vector, "POSIXt")) {
    return(as.Date(date_vector))
  }
  
  # Convert to character if not already
  date_vector <- as.character(date_vector)
  
  # Remove any NA values for analysis
  non_na_dates <- date_vector[!is.na(date_vector) & date_vector != ""]
  
  if (length(non_na_dates) == 0) {
    # All NA or empty, just return a vector of NAs
    return(as.Date(date_vector))
  }
  
  # Define a set of possible date formats to try
  possible_formats <- c(
    # American formats
    "mdy", "mdY", "m/d/y", "m/d/Y", "m-d-y", "m-d-Y",
    # European/ISO formats
    "ymd", "Ymd", "y/m/d", "Y/m/d", "y-m-d", "Y-m-d",
    # Other common formats
    "dmy", "dmY", "d/m/y", "d/m/Y", "d-m-y", "d-m-Y",
    # Month name formats
    "mdy_b", "mdY_b", "b_d_y", "b_d_Y", 
    "dmy_b", "dmY_b", "d_b_y", "d_b_Y", 
    "ymd_b", "Ymd_b", "y_b_d", "Y_b_d"
  )
  
  # Try to detect the date format
  tryCatch({
    # Sample the first few non-NA dates to guess format
    sample_size <- min(100, length(non_na_dates))
    sample_dates <- non_na_dates[1:sample_size]
    
    # Try parsing with each format and keep track of success rate
    format_success <- numeric(length(possible_formats))
    
    for (i in seq_along(possible_formats)) {
      parsed_dates <- suppressWarnings(
        lubridate::parse_date_time(sample_dates, possible_formats[i], quiet = TRUE)
      )
      format_success[i] <- sum(!is.na(parsed_dates)) / length(sample_dates)
    }
    
    # Find the format with the highest success rate
    best_format_idx <- which.max(format_success)
    best_format <- possible_formats[best_format_idx]
    
    # If the best format doesn't parse at least 50% of dates, try combo of top formats
    if (format_success[best_format_idx] < 0.5) {
      # Get top 3 formats
      top_formats <- possible_formats[order(format_success, decreasing = TRUE)[1:3]]
      
      # Try parsing with these formats
      parsed_dates <- suppressWarnings(
        lubridate::parse_date_time(date_vector, top_formats, quiet = TRUE)
      )
    } else {
      # Parse all dates with the best format
      parsed_dates <- suppressWarnings(
        lubridate::parse_date_time(date_vector, best_format, quiet = TRUE)
      )
    }
    
    # Convert to Date class
    result <- as.Date(parsed_dates)
    
    # Basic validation: check for impossibly old dates (before 1900) or future dates
    result[result < as.Date("1900-01-01") | result > Sys.Date() + 30] <- NA
    
    # Log stats about parsing
    success_rate <- sum(!is.na(result)) / length(date_vector) * 100
    message(sprintf("Parsed %s: %.1f%% successful using %s format", 
                    column_name, success_rate, 
                    ifelse(format_success[best_format_idx] < 0.5, 
                           paste(top_formats, collapse=", "), best_format)))
    
    return(result)
  }, error = function(e) {
    # Fallback: try base R's as.Date with common formats
    warning(sprintf("Advanced date parsing failed for %s: %s. Falling back to basic parsing.",
                    column_name, e$message))
    
    fallback_formats <- c("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%Y/%m/%d")
    for (fmt in fallback_formats) {
      parsed <- suppressWarnings(as.Date(date_vector, format = fmt))
      if (sum(!is.na(parsed)) / length(parsed) > 0.5) {
        message(sprintf("Basic parsing of %s succeeded with format: %s", column_name, fmt))
        return(parsed)
      }
    }
    
    # If all else fails, return NA
    warning(sprintf("All date parsing methods failed for %s", column_name))
    return(as.Date(rep(NA, length(date_vector))))
  })
}

#' Alias for 'qualtrics'
#'
#' This is a legacy alias for the 'qualtrics' function to maintain compatibility with older code.
#'
#' @inheritParams qualtrics
#' @inherit qualtrics return
#' @export
#' @examples
#' \dontrun{
#' survey_data <- getSurvey("your_survey_alias")
#' }
getSurvey <- qualtrics

Try the wizaRdry package in your browser

Any scripts or data that you put into this service are public.

wizaRdry documentation built on June 8, 2025, 11:30 a.m.