Nothing
#' Function to query allofus observation table for survey responses
#'
#' @description Extracts survey responses in a tidy format that also includes
#' ‘skip’ responses and collapses across all versions of the person health /
#' personal medical history surveys. Currently responses in the ‘ds_survey’
#' table omit skipped responses. Responses are returned as Yes" if the
#' respondent answered that the individual had the condition, No" if the
#' respondent answered that the individual did not have that condition (or
#' omitted it when selecting from related conditions), a skip response if the
#' question was skipped, and NA if the respondent did not answer the question.
#' Returns a data frame or SQL tbl with the initial cohort table along with a
#' column for each question included in questions and answers foreach
#' person_id in the cells. To find the desired survey questions, use the all
#' of us data dictionary, survey codebook, Athena, data browser, or the
#' modified codebook which can be found in the allofus R package.
#'
#' @details The function will return a dataframe or SQL tbl with the initial
#' cohort table along with a column for each question included in `questions`
#' and answers for each person_id in the cells. The column names (questions)
#' can be returned as the concept_code or concept_id or by providing new
#' column names. For each question, a column with the suffix "_date" is
#' included with the date on which the question was answered. When questions
#' can have multiple answers ("checkbox"-style questions), answers are
#' returned as a comma-separated string.
#'
#' To find the desired survey questions, use the all of us data dictionary,
#' survey codebook, athena, data browser, or the allofus R package modified
#' codebook which can be found here:
#' https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html For
#' questions regarding an individual's health history or family health
#' history, the function requires the specific concept_id (or concept_code)
#' for individual in question, whether that is "self" or another relative.
#' Responses are returned as "Yes" if the respondent answered that the
#' individual had the condition, "No" if the respondent answered that the
#' individual did not have that condition (or omitted it when selecting from
#' related conditions), a skip response if the question was skipped, and NA if
#' the respondent did not answer the question.
#'
#' @param cohort Reference to a remote table or local dataframe with a column
#' called "person_id"
#' @param questions either a vector of concept_ids or concept_codes for
#' questions to return results
#' @param question_output how to name the columns. Options include as the text
#' of the concept code ("concept_code"), as concept ids preceded by "x_"
#' ("concept_id"), or using a custom vector of column names matching the
#' vector of `questions`. Defaults to "concept_code".
#' @param clean_answers whether to clean the answers to the survey questions.
#' Defaults to TRUE.
#' @param con connection to the allofus SQL database. Defaults to
#' getOption("aou.default.con"), which is created automatically with
#' `aou_connect()`
#' @param collect Whether to bring the resulting table into local memory
#' (`collect = TRUE`) as a dataframe or leave as a reference to a database
#' table (for continued analysis using, e.g., `dbplyr`). Defaults to `FALSE.`
#' @param ... additional arguments passed to `collect()` when `collect = TRUE`
#' @return A dataframe if `collect = TRUE`; a reference to a remote database
#' table if not.
#' @export
#' @examplesIf on_workbench()
#'
#' con <- aou_connect()
#' cohort <- dplyr::tbl(con, "person") %>%
#' dplyr::filter(person_id > 5000000) %>%
#' dplyr::select(person_id, year_of_birth, gender_concept_id)
#'
#' aou_survey(
#' cohort,
#' questions = c(1585375, 1586135),
#' question_output = "concept_code"
#' )
#' aou_survey(
#' cohort,
#' questions = c(1585811, 1585386),
#' question_output = c("pregnancy", "insurance")
#' )
#' aou_survey(
#' cohort,
#' questions = c(1585375, 1586135, 1740719, 43529932),
#' question_output = c("income", "birthplace", "grandpa_bowel_obstruction", "t2dm"),
#' collect = FALSE
#' )
#'
#' aou_survey(cohort,
#' questions = 1384452,
#' question_output = "osteoarthritis"
#' ) %>%
#' dplyr::count(osteoarthritis)
#'
aou_survey <- function(cohort = NULL,
questions,
question_output = "concept_code",
clean_answers = TRUE,
collect = FALSE,
...,
con = getOption("aou.default.con")) {
# check for connection
if (is.null(con)) {
cli::cli_abort(c("No connection available.",
"i" = "Provide a connection automatically by running {.code aou_connect()} before this function.",
"i" = "You can also provide {.code con} as an argument or default with {.code options(aou.default.con = ...)}."
))
}
# check for NA values or empty strings in questions or question_output
if (any(is.na(c(questions, question_output))) | any(c(questions, question_output) == "")) {
cli::cli_abort("NA values or empty string detected in {.code questions} or {.code question_output}")
}
if (length(question_output) == length(questions)) { # either gave column names or happen to have a single question
if (length(question_output) == 1) { # gave one column name or "concept_code" or "concept_id"
# see if the matched argument matches the actual argument
question_output_arg <- tryCatch(match.arg(question_output, c("concept_code", "concept_id")), error = function(e) question_output)
# if not, print a message so that the user can spot their problem
if (!identical(question_output, question_output_arg)) {
cli::cli_inform("Using {question_output} as column name.",
">" = "Did you really mean {question_output_arg}?"
)
question_output_arg <- question_output
}
} else {
question_output_arg <- question_output
}
# didn't give the right number question_output and it doesn't match one of the options
} else if (is.null(tryCatch(match.arg(question_output, c("concept_code", "concept_id")), error = function(e) NULL))) {
cli::cli_abort(c("Length of argument {.code question_output} doesn't match {.code questions}.",
"i" = "Provide a vector of column names the same length as {.code questions}.",
"i" = "Alternatively, set either {.code question_output = \"concept_code\"} or {.code question_output = \"concept_id\"}"
))
# gave one value for question_output and multiple values for questions
} else {
question_output_arg <- match.arg(question_output, c("concept_code", "concept_id"))
}
question_output <- if (length(question_output_arg) == 1 && question_output_arg[1] == "concept_id") "concept_id" else "value"
if (is.null(cohort)) {
cli::cli_warn(c("No cohort provided.", ">" = "Pulling survey data for entire All of Us cohort."))
function_cohort <- dplyr::tbl(con, "person") %>% dplyr::select("person_id")
} else if (!"person_id" %in% colnames(cohort)) {
# ensure person_id is a column name in cohort
cli::cli_abort(c("{.code person_id} column not found in cohort.",
"i" = "Confirm that the cohort has a column named {.code person_id}"
))
} else if (is.data.frame(cohort)) {
function_cohort <- dplyr::tbl(con, "person") %>%
dplyr::filter(.data$person_id %in% !!unique(cohort$person_id)) %>%
dplyr::select("person_id")
} else {
function_cohort <- cohort %>%
dplyr::distinct(.data$person_id)
}
# pivot longer to inclde the rx, on_txt, and age_diagnosis columns
aou_health_history_long <- allofus::aou_health_history %>%
dplyr::filter(.data$relative == "self") %>%
dplyr::rename(concept_code_overall = "concept_code") %>%
tidyr::pivot_longer(
cols = c(
"concept_id_rx_meds", "concept_id_on_txt", "concept_id_age_diagnosis",
"concept_code_rx_meds", "concept_code_on_txt", "concept_code_age_diagnosis"
),
names_pattern = "(.+_.+)_(.+_.+)",
names_to = c(".value", "question_sub"), values_to = c("concept_id_sub")
) %>%
dplyr::mutate(
concept_id_for_sub = .data$concept_id_specific,
concept_id_specific = .data$concept_id
) %>%
dplyr::distinct(
.data$question, .data$relative, .data$condition, .data$category, .data$concept_code, .data$concept_code_overall, .data$concept_id_specific,
.data$concept_id_for_sub
) %>%
dplyr::bind_rows(allofus::aou_health_history)
# branching logic for how to handle concept_id vs. concept_code question inputs
if (is.character(questions)) {
# if its a character vector, go find the matching concept_ids because thats much faster
# and easier to search with %in%
regular_survey_qs <- questions[questions %in% allofus::aou_codebook$concept_code]
health_survey_qs <- questions[questions %in% aou_health_history_long$concept_code]
# did we account for everything?
missing_qs <- questions[!questions %in% c(regular_survey_qs, health_survey_qs)]
if (length(missing_qs) > 0) {
cli::cli_abort(c(paste("Concept codes", paste(missing_qs, collapse = ", "), "not found in codebook."),
"i" = "Check spelling and confirm that concept codes appear in the codebook at {.url https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html}."
))
}
# get the concept_ids for the regular survey questions
regular_survey_concept_ids <- allofus::aou_codebook %>%
dplyr::filter(.data$concept_code %in% regular_survey_qs) %>%
dplyr::pull("concept_id") %>%
unique()
health_survey_concept_ids <- aou_health_history_long %>%
dplyr::filter(.data$concept_code %in% health_survey_qs) %>%
dplyr::pull("concept_id_specific")
# there are some concept_ids in the regular codebook that are there for reference
# not to search -- make sure those are not included
too_general <- regular_survey_concept_ids[regular_survey_concept_ids %in% aou_health_history_long$concept_id_overall]
if (length(too_general) > 0) {
too_general <- regular_survey_qs[regular_survey_concept_ids %in% aou_health_history_long$concept_id_overall]
cli::cli_abort(c(paste("Concept code(s) ", paste0(too_general, collapse = ", "), "is/are too general."),
"i" = "Health history codes must refer to a specific condition and person pairing.",
"i" = "Look for a specific condition in the health history codebook at {.url https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html}."
))
}
names_for_lookup <- dplyr::tibble(concept_code = questions, cn = !!question_output_arg)
} else {
# if its already numeric, just look
regular_survey_concept_ids <- questions[questions %in% allofus::aou_codebook$concept_id]
health_survey_concept_ids <- questions[questions %in% aou_health_history_long$concept_id_specific]
# did we account for everything?
missing_qs <- questions[!questions %in% c(regular_survey_concept_ids, health_survey_concept_ids)]
if (length(missing_qs) > 0) {
# check to see if any are in the health history codebook as overall concept ids
too_general <- missing_qs[missing_qs %in% aou_health_history_long$concept_id_overall]
if (length(too_general) > 0) {
cli::cli_abort(c(paste("Concept ID(s) ", paste0(too_general, collapse = ", "), "is/are too general."),
"i" = "Health history codes must refer to a specific condition and person pairing.",
"i" = "Look for a specific condition in the health history codebook at {.url https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html}."
))
} else {
cli::cli_abort(c(paste("Concept ids", paste(missing_qs, collapse = ", "), "not found in codebook."),
"i" = "Confirm that concept codes appear in the codebook at {.url https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html}."
))
}
}
names_for_lookup <- dplyr::tibble(concept_id = questions, cn = !!question_output_arg)
}
if (length(health_survey_concept_ids) > 0) {
# are there any that are the meds/treatment/age questions?
sub_questions <- aou_health_history_long %>%
dplyr::filter(.data$concept_id_specific %in% health_survey_concept_ids) %>%
dplyr::filter(!is.na(.data$concept_id_for_sub))
if (nrow(sub_questions) > 0) {
# if there are some sub questions without the parents
if (!all(sub_questions$concept_id_for_sub %in% health_survey_concept_ids)) {
missing_qs <- sub_questions %>%
dplyr::filter(!.data$concept_id_for_sub %in% health_survey_concept_ids)
new_cols <- if (question_output == "concept_id") {
paste0("`x", missing_qs$concept_id_for_sub, "`")
} else {
paste0("`", missing_qs$concept_code_overall, "`")
}
cli::cli_inform(
c(
"i" = "One or more of the requested questions were only asked of people who responded that they had certain conditions. ",
">" = "The top-level question(s) will be added to the output to provide context about missing data as column(s) {paste0(new_cols, collapse = ', ')}."
)
)
# health_survey_concept_ids <- c(health_survey_concept_ids, missing_qs$concept_id_for_sub)
names_for_lookup <- dplyr::bind_rows(
names_for_lookup,
dplyr::tibble(
concept_id = missing_qs$concept_id_for_sub,
concept_code = missing_qs$concept_code_overall,
cn = missing_qs$concept_code_overall,
type = "health"
)
)
}
}
}
regular_survey_concept_codes <- allofus::aou_codebook %>%
dplyr::filter(.data$concept_id %in% regular_survey_concept_ids) %>%
dplyr::distinct(.data$concept_code, .data$concept_id) %>%
dplyr::mutate(type = "regular")
health_survey_concept_codes <- aou_health_history_long %>%
dplyr::filter(.data$concept_id_specific %in% health_survey_concept_ids) %>%
dplyr::select("concept_code", concept_id = "concept_id_specific") %>%
dplyr::distinct(.data$concept_code, .data$concept_id) %>%
dplyr::mutate(type = "health")
# there are more graceful ways to merge this data but ignoring that...
suppressMessages({
concept_lookup <- dplyr::bind_rows(regular_survey_concept_codes, health_survey_concept_codes) %>%
dplyr::full_join(names_for_lookup) %>%
dplyr::group_by(.data$concept_id) %>%
tidyr::fill(dplyr::everything(), .direction = "up") %>%
dplyr::slice(1) %>%
dplyr::ungroup()
})
all_health_survey_concept_ids <- dplyr::filter(concept_lookup, .data$type == "health") %>%
dplyr::pull("concept_id")
health_survey_concept_ids <- all_health_survey_concept_ids[all_health_survey_concept_ids %in% allofus::aou_health_history$concept_id_specific]
# the sub questions for the conditions can just be treated like regular survey questions
regular_survey_concept_ids <- c(
regular_survey_concept_ids,
all_health_survey_concept_ids[!all_health_survey_concept_ids %in%
allofus::aou_health_history$concept_id_specific]
)
if (length(health_survey_concept_ids) > 0) {
all_health <- purrr::map(health_survey_concept_ids, ~ {
specific_concept_id <- .x
# this will be what the column is called
if (question_output_arg[1] == "concept_id") {
condition_name <- paste0("x", specific_concept_id)
} else if (question_output_arg[1] == "concept_code") {
condition_name <- concept_lookup %>%
dplyr::filter(.data$concept_id == specific_concept_id) %>%
dplyr::pull("concept_code")
} else {
condition_name <- concept_lookup %>%
dplyr::filter(.data$concept_id == specific_concept_id) %>%
dplyr::pull("cn")
}
condition_date <- paste0(condition_name, "_date")
osci_specific <- aou_health_history_long %>%
dplyr::filter(.data$concept_id_specific == specific_concept_id) %>%
dplyr::pull("concept_id_question") %>%
unique()
osci_overall <- allofus::aou_health_history %>%
dplyr::filter(.data$concept_id_specific == specific_concept_id) %>%
dplyr::pull("concept_id_overall") %>%
unique()
if (length(osci_specific) == 0) {
cli::cli_abort(c(paste("Concept id ", specific_concept_id, "is too general."),
"i" = "Health history codes must refer to a specific condition and person pairing.",
"i" = "Look for a specific condition in the health history codebook at {.url https://roux-ohdsi.github.io/allofus/articles/searchable_codebook.html}."
))
}
if (length(osci_specific) == 1 & !is.na(osci_overall)) { # this is not the case if an infectious disease question
cli::cli_inform(c("i" = "The question associated with concept id {specific_concept_id} was added to the later version of the family health history survey so earlier All of Us participants may not have answered it."))
}
obs <- dplyr::tbl(con, "observation") %>%
dplyr::inner_join(dplyr::select(function_cohort, "person_id"), by = "person_id") %>%
dplyr::filter(.data$observation_source_concept_id %in% !!c(osci_overall, osci_specific)) %>%
dplyr::select("person_id", "observation_source_concept_id", "value_source_concept_id", "value_source_value", "observation_date") %>%
dplyr::mutate(type = dplyr::case_when(.data$observation_source_concept_id %in% osci_specific ~ "Specific", TRUE ~ "Overall")) %>%
tidyr::pivot_wider(
id_cols = c("person_id", "observation_date", "type"), names_from = "value_source_value",
values_from = "value_source_concept_id"
) %>%
dplyr::group_by(.data$person_id, .data$type) %>%
dbplyr::window_order(.data$observation_date) %>%
tidyr::fill(-c("person_id", "observation_date", "type"), .direction = "down") %>%
dplyr::slice_max(order_by = observation_date, n = 1, with_ties = FALSE) %>%
dplyr::ungroup()
if (ncol(obs) == 3) {
return(NULL)
} # if there are no matching concepts
if (isTRUE(clean_answers)) {
pnta <- "PreferNotToAnswer"
sk <- "Skip"
dk <- "DontKnow"
} else {
pnta <- "PMI_PreferNotToAnswer"
sk <- "PMI_Skip"
dk <- "PMI_DontKnow"
}
obs %>%
dplyr::mutate(condition = dplyr::case_when(
dplyr::if_any(-c("person_id", "observation_date", "type"), ~ .x == !!specific_concept_id) ~ "Yes",
.data$type == "Specific" & dplyr::if_any(-c("person_id", "observation_date", "type"), ~ !.x %in% c(903079, 903096, 903087)) ~ "No",
dplyr::if_any(-c("person_id", "observation_date", "type"), ~ .x == 903079) ~ pnta,
dplyr::if_any(-c("person_id", "observation_date", "type"), ~ .x == 903096) ~ sk,
dplyr::if_any(-c("person_id", "observation_date", "type"), ~ .x == 903087) ~ dk,
TRUE ~ "No"
)) %>%
dplyr::select("person_id", "type", "condition", "observation_date") %>%
dplyr::group_by(.data$person_id) %>%
dplyr::slice_max(order_by = type, n = 1) %>%
dplyr::ungroup() %>%
dplyr::select("person_id", !!condition_name := "condition", !!condition_date := "observation_date") %>%
aou_compute()
}) %>%
purrr::reduce(dplyr::left_join, by = "person_id")
if (!is.null(all_health)) {
cohort_w_health <- dplyr::left_join(function_cohort, all_health, by = "person_id")
} else {
cli::cli_inform(c("x" = "No data found for health history questions."))
cohort_w_health <- function_cohort
}
} else {
cohort_w_health <- function_cohort
}
## this is the part for the regular survey questions, from the original function
if (length(regular_survey_concept_ids) > 0) {
# temporary observation table with responses
tmp <- dplyr::tbl(con, "observation") %>%
dplyr::filter(.data$observation_source_concept_id %in% regular_survey_concept_ids) %>%
# this is necessary because there may be multiple rows for a single person (hence full_join later)
dplyr::inner_join(dplyr::select(function_cohort, "person_id"), by = "person_id") %>%
aou_compute()
# for retrieving columns and pivoting
q <- paste0("observation_source_", question_output)
# need a prefix to fix column names if using concept_id as column names
if (question_output == "concept_id") {
pref <- "x"
} else {
pref <- ""
}
if (isTRUE(clean_answers)) {
tmp <- dplyr::mutate(tmp,
value_source_value = dplyr::case_when(
CONTAINS_SUBSTR(.data$value_source_value, "cope_") ~ value_source_value,
CONTAINS_SUBSTR(.data$value_source_value, "SDOH_") ~ value_source_value,
!CONTAINS_SUBSTR(.data$value_source_value, "_") ~ value_source_value,
TRUE ~ REGEXP_EXTRACT(.data$value_source_value, ".+_(.+_*.*)")
)
)
}
# go wide
wide <- tmp %>%
# numeric answers are stored in value_as_number
dplyr::mutate(value_source_value = dplyr::coalesce(.data$value_source_value, CAST(dplyr::sql("value_as_number AS STRING")))) %>%
# first combine all rows for a single person and question (e.g., multiple races)
dplyr::group_by(.data$person_id, .data$observation_date, dplyr::across(dplyr::all_of(q))) %>%
dplyr::summarise(
value_source_value = STRING_AGG(dplyr::sql("value_source_value order by value_source_value")),
.groups = "drop"
) %>%
dplyr::select(dplyr::all_of(c("person_id", !!q, "value_source_value", "observation_date"))) %>%
tidyr::pivot_wider(names_from = !!q, values_from = c("value_source_value", "observation_date"), names_prefix = pref)
if (length(question_output_arg) == 1 && question_output_arg[1] %in% c("concept_code", "concept_id")) {
wide <- wide %>%
dplyr::rename_with(.fn = gsub, pattern = "value_source_value_|value_source_concept_id", replacement = "") %>%
dplyr::rename_with(.fn = gsub, pattern = "observation_date_(.+)", replacement = "\\1_date")
} else {
# named vector to rename the columns if needed
concept_codes <- concept_lookup %>%
dplyr::filter(.data$concept_id %in% regular_survey_concept_ids) %>%
dplyr::select("concept_code", "cn")
nm <- c(
paste0("value_source_value_", concept_codes$concept_code),
paste0("observation_date_", concept_codes$concept_code)
)
names(nm) <- c(concept_codes$cn, paste0(concept_codes$cn, "_date"))
if (!all(nm %in% tolower(colnames(wide)))) {
cli::cli_abort(c("Wrong column names.",
"i" = "Try one question at a time, and report this issue at {.url https://github.com/roux-ohdsi/allofus/issues}"
))
}
# change to lower, then rename
wide <- wide %>%
dplyr::rename_with(tolower) %>%
dplyr::rename(dplyr::all_of(nm)) %>%
aou_compute()
}
# join back to original table
out <- dplyr::full_join(cohort_w_health, wide, by = "person_id")
} else { # end regular survey questions
out <- cohort_w_health
}
# a little reorganization
out <- out %>%
dplyr::relocate(dplyr::ends_with("_date") & !dplyr::any_of(colnames(cohort)), .after = dplyr::last_col())
# collect if indicated
if (isTRUE(collect)) {
out <- dplyr::collect(out, ...)
}
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.