R/utils.R

Defines functions .check_input_pair_df .check_for_patient_id .check_for_study_id

#' Check for study ID
#'
#' @param study_id
#'
#' @return stop if no arg
#' @keywords internal
#' @noRd
#'
.check_for_study_id <- function(study_id) {

  study_id %||%
    cli::cli_abort(c("You must provide a {.code study_id}. See {.code get_studies()} to view available studies on your database"))

  if(length(study_id) > 1) {
    cli::cli_abort(c("{.code length(study_id)} must be 1. You can only pass one {.code study_id} at a time"))
  } else {
    NULL
  }

}

#' Check for Patient ID
#'
#' @param patient_id
#'
#' @return stop if no sample_id arg
#' @keywords internal
#' @noRd
#'
.check_for_patient_id <- function(patient_id) {
  if (is.null(patient_id)) {
    stop("You must provide at least one patient id")
  }
}


#' Check Sample ID-Study ID and Patient ID-Study ID pairs input data frames
#'
#' @param input_df input data frame to check
#'
#' @return A valid `sample_study_pairs` or `patient_study_pairs` data frame. If `input_df` is NULL, it will return NULL.
#' @keywords internal
#' @noRd
#'
.check_input_pair_df <- function(input_df) {

  input_df %||% return(NULL)

  # may change this to: exists(arg, envir = rlang::caller_env())
  arg_name <- deparse(substitute(input_df))

  # must be a data.frame
  switch(!inherits(input_df, "data.frame"),
    rlang::abort("{arg_name} must be a `data.frame`")
  )

  # must have sample_id and study_id columns
  names(input_df) <- names(input_df) %>%
    stringr::str_remove_all(., stringr::fixed(" ")) %>%
    stringr::str_remove_all(., stringr::fixed("_")) %>%
    stringr::str_remove_all(., stringr::fixed(".")) %>%
    stringr::str_to_lower()

  switch(arg_name,
    "sample_study_pairs" = {
      final_names <- c("sample_id", "study_id")
    },
    "patient_study_pairs" = {
      final_names <- c("patient_id", "study_id")
      accepted_names <- c(final_names, stringr::str_remove_all(final_names, "_"))
    }
  )

  accepted_names <- c(final_names, stringr::str_remove_all(final_names, "_"))

  output_df <-
    if(!(any(stringr::str_detect(names(input_df), paste0(accepted_names[c(1, 3)], collapse = "|"))) &
               any(stringr::str_detect(names(input_df), paste0(accepted_names[c(2, 4)], collapse = "|"))))) {

      cli::cli_abort("{arg_name} must have the following columns: {final_names}")
    } else {
      select(input_df, (contains("sample") |
                          contains("patient")),
             contains("study")) %>%
        purrr::set_names(final_names)
    }


  # if molecular_profile_id passed, keep it
  optional_molec <- c("molecular_profile_id",
                      stringr::str_remove_all("molecular_profile_id", "_"))

  molec_col <- names(input_df)[stringr::str_detect(names(input_df),
                                          paste0(optional_molec, collapse = "|"))]

  if(length(molec_col > 0)) {
    output_df <- input_df %>%
      transmute("molecular_profile_id" = .data[[molec_col]]) %>%
      bind_cols(output_df, .)
  }

  output_df
}

#' Guess Study ID based on URL
#'
#' @param study_id a study id passed by a user
#' @param resolved_url the database URL
#'
#' @return a guess at which study_id a user may want to use
#' @keywords internal
#' @noRd
#'
.guess_study_id <- function(study_id, resolved_url) {

  study_id_guess <- study_id %||%
    switch(stringr::str_detect(resolved_url, "mskcc.org"), "mskimpact") %||%
    switch(resolved_url == "www.cbioportal.org/api",
           "msk_impact_2017")

  study_id_guess %||% cli::cli_abort("Unable to guess a {.code study_id} for your database.
                                   Please provide a {.code study_id}.")

  cli::cli_alert_info("No {.code study_id} provided. Using {.val {study_id_guess}} as default study")

  return(study_id_guess)
}


#' Get Molecular Profile Name for Data Type
#'
#' @description See: https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data
#  for definition of molecular profiles. CNA can have _cna, _gistic, _rae molecular profile names

#' @param data_type specify what type of data to return. Options are`mutation`, `cna`, `fusion`.
#' @param study_id study id for which to lookup profiles
#' @param base_url The database URL to query
#' If `NULL` will default to URL set with `set_cbioportal_db(<your_db>)`
#'
#' @return find molecular profile name for a specified data type
#' @keywords internal
#' @noRd
#'
.lookup_profile_name <- function(data_type, study_id, base_url) {

  profs <- available_profiles(study_id = study_id, base_url = base_url)

  resolved_profile <- switch(data_type,
                             mutation = filter(profs, .data$molecularAlterationType == "MUTATION_EXTENDED") %>%
                               dplyr::pull(.data$molecularProfileId),
                             fusion = filter(profs, .data$molecularAlterationType == "STRUCTURAL_VARIANT") %>%
                               dplyr::pull(.data$molecularProfileId),
                             cna = filter(profs, .data$molecularAlterationType == "COPY_NUMBER_ALTERATION" &
                                            .data$datatype == "DISCRETE") %>%
                               dplyr::pull(.data$molecularProfileId),
                             segment = "Not Applicable")


  if(length(resolved_profile) == 0) {
    cli::cli_abort("No molecular profile for `data_type = {data_type}` found in {.val {study_id}}.  See `available_profiles('{study_id}')`")
  }

  resolved_profile
}

#' Get Study ID From Passed Mutation Profile ID
#'

#' @param molecular_profile_id Molecular profile name
#' @param base_url The database URL to query
#' If `NULL` will default to URL set with `set_cbioportal_db(<your_db>)`
#'
#' @return find study ID for a specified molecular profile
#' @keywords internal
#' @noRd
#'
.lookup_study_name <- function(molecular_profile_id, study_id, base_url) {

  # study_id = NULL- will return all studies quietly
  # If study ID is supplied but wrong (doesn't exist in database) this will fail
  quiet_available_profiles <- purrr::quietly(available_profiles)

  profs <- tryCatch(

    # ** Maybe there can be a better API fail message that propagates throughout because base_url should  always be checked/throw error before
    #  any parameter issues.
    quiet_available_profiles(study_id = study_id, base_url = base_url),
                     error = function(e) cli::cli_abort("API Failed, check your database connection ({.code test_cbioportal_db()}) and make sure {.val study_id:}{.code {study_id}} exists ({.code available_studies()})"))


  resolved_study_id <- profs$result %>%
    filter(.data$molecularProfileId == molecular_profile_id) %>%
    dplyr::pull(.data$studyId)

  if(length(resolved_study_id) == 0) {
    cli::cli_abort("Molecular profile {.val {molecular_profile_id}} doesn't exist, or molecular profile doesn't match the {.val study_id} you passed. See {.code available_profiles()} or {.code available_studies()}")
  }

  resolved_study_id

}


#' Get Hugo Symbols From Genomics Data Pulls
#'

#' @param df data frame resulting from genomic data pulls
#' @param base_url The database URL to query
#' If `NULL` will default to URL set with `set_cbioportal_db(<your_db>)`
#'
#' @return a dataframe that matches input data frame but with hugoGeneSymbol column
#' @keywords internal
#' @noRd
#'
.lookup_hugo <- function(df, base_url) {

  if(any(stringr::str_detect(names(df), "hugoGeneSymbol"))) {
    cli::cli_warn("{.field 'hugoGeneSymbol'} column already exists. Overwriting this column. Please set {.code add_hugo = FALSE} to supress this}")
    df <- df %>%
      select(-all_of('hugoGeneSymbol'))
  }

  hugo <- get_hugo_symbol(unique(df$entrezGeneId)) %>%
    select(-"type")

  df_with_hugo <- left_join(df, hugo, by = "entrezGeneId" ) %>%
    select("hugoGeneSymbol", "entrezGeneId", everything())

  # If there happens to be more than 1 hugo per entrez
  if(!(nrow(df_with_hugo) == nrow(df))) {
    cli::cli_abort("Could not automatically add Hugo Symbol.
                 To proceed, please set {.code add_hugo = FALSE}.
                 You can investigate issue by calling {.code get_hugo(<your df>$entrezGeneId)} on results")
  }

  # When hugo symbol unknown, return entrez_gene with signifier
  df_with_hugo <- df_with_hugo %>%
    mutate(hugoGeneSymbol = case_when(is.na(hugoGeneSymbol) ~ paste0("unk_gene_", entrezGeneId),
                                      TRUE ~ hugoGeneSymbol))

  return(df_with_hugo)


}


#' Get Entrez Gene IDs for a given panel ID
#'

#' @param panel_id Panel ID for which to retrieve gene IDs (see `available_gene_panels()` for options).
#' If NULL, it will return NULL
#' @param base_url The database URL to query
#' If `NULL` will default to URL set with `set_cbioportal_db(<your_db>)`
#'
#' @return a vector of Entrez Gene IDs
#' @keywords internal
#' @noRd
#'
.get_panel_entrez <- function(panel_id, base_url) {

  # if NULL, return NULL
  panel_id %||% return(panel_id)

  res <- tryCatch(get_gene_panel(panel_id = panel_id, base_url = base_url),
    error = function(e) {
      cli::cli_abort("There was an error pullling genes for panel ID: {.code {panel_id}}. See {.code available_gene_panels()} for supported panels.")
    })

  entrez_ids <- res %>%
    dplyr::pull(.data$entrezGeneId) %>%
    unique()

  return(entrez_ids)

    }


#' Check if NULL
#'
#' @param x any R object or expression
#' @noRd
#' @keywords internal
#'
#' @examples
#' is_not_null(NULL)
is_not_null <- function(x) {
  !is.null(x)
}

Try the cbioportalR package in your browser

Any scripts or data that you put into this service are public.

cbioportalR documentation built on Oct. 6, 2023, 1:07 a.m.