R/genomics_by_study.R

Defines functions get_genetics_by_study get_segments_by_study get_fusions_by_study get_cna_by_study get_mutations_by_study .get_data_by_study

Documented in get_cna_by_study .get_data_by_study get_fusions_by_study get_genetics_by_study get_mutations_by_study get_segments_by_study

# Generalized Data Pull Function ------------------------------------------------------------

#' Internal Function to Get Mutations/CNA/Fusion By Study ID
#'
#' @description Endpoints for retrieving mutation and cna data are structurally similar.
#' This internal function allows you to pull data from either endpoint. It has
#' logic for sensible default guesses at `study_id` and `molecular_profile_id` when those are `NULL`
#'
#' @param study_id A study ID to query mutations. If NULL, guesses study ID based
#' on molecular_profile_id.
#' @param molecular_profile_id a molecular profile to query mutations.
#' If NULL, guesses molecular_profile_id based on study ID.
#' @param data_type specify what type of data to return. Options are`mutation`, `cna`, `fusion`, or`structural_variant` (same as `fusion`),
#' and `segment` (copy number segmentation data)..
#' @param add_hugo Logical indicating whether `HugoGeneSymbol` should be added to your resulting data frame, if not already present in raw API results.
#' Argument is `TRUE` by default. If `FALSE`, results will be returned as is (i.e. any existing Hugo Symbol columns in raw results will not be removed).
#' @param base_url The database URL to query
#' If `NULL` will default to URL set with `set_cbioportal_db(<your_db>)`
#'
#' @return a dataframe of mutations, CNAs or structural variants
#' @export
#' @keywords internal
#'
#' @examples
#' \dontrun{
#' set_cbioportal_db("public")
#' .get_data_by_study(study_id = "prad_msk_2019", data_type = "cna")
#' .get_data_by_study(study_id = "prad_msk_2019", data_type = "mutation")
#' .get_data_by_study(study_id = "prad_msk_2019", data_type = "fusion")
#'
#' .get_data_by_study(molecular_profile_id = "prad_msk_2019_cna", data_type = "cna")
#' .get_data_by_study(molecular_profile_id = "prad_msk_2019_mutations", data_type = "mutation")
#' .get_data_by_study(molecular_profile_id = "prad_msk_2019_structural_variants", data_type = "fusion")
#' }
#'
.get_data_by_study <- function(study_id = NULL,
                              molecular_profile_id = NULL,
                              data_type = c("mutation", "cna", "fusion", "structural_variant", "segment"),
                              base_url = NULL,
                              add_hugo = TRUE) {



  # Check Arguments  -----------------------------------------------------------

  # For queries, you need both study_id and molecular_profile_id

  # If both study_id & molecular_profile_id are NULL- stop
  if(is.null(study_id) & is.null(molecular_profile_id)) {
    cli::cli_abort("You must provide a {.code study_id} or a {.code molecular_profile_id}. See {.code available_profiles(<study_id>)} to view available profiles for a study")
  }

  # fusions and structural_variants are the same. fusion is older nomenclature.
  data_type <- match.arg(data_type)

  data_type <- dplyr::case_when(
    data_type == "structural_variant" ~ "fusion",
    TRUE ~ data_type)

  if(data_type != "segment") {
    # study ID provided and profile is NULL
    # If study ID is not correct, informative error thrown
    molecular_profile_id <- molecular_profile_id %||%
      .lookup_profile_name(data_type, study_id, base_url = base_url)


    # if study_id is NULL or not NULL molecular profile ID can't be NULL
    study_id <- .lookup_study_name(molecular_profile_id = molecular_profile_id,
                         study_id = study_id,
                         base_url = base_url)
  }


  # this text goes in query URL path
  url_data_type <- switch(
    data_type,
    "mutation" = "mutations",
    "fusion" = "structural-variant",
    "cna" = "discrete-copy-number",
    "segment" = "copy-number-segments")

  # Some API endpoints require that you pass a sample list ID. All studies should have an "all" list which is the default for this function
  sample_list_id <- paste0(study_id, "_all")


  # MUTATION/CNA query ----------------------------------------------------------------------

  # GET: /molecular-profiles/{molecularProfileId}/molecular-data?{sample_list_id}

  if(data_type %in% c("mutation", "cna")) {

    url_list <- paste0(
      "molecular-profiles/", molecular_profile_id,
      "/",
      url_data_type,
      "?sampleListId=",
      sample_list_id
    )

    df <- purrr::map_dfr(url_list, function(x) {
      res <- cbp_api(x, base_url = base_url)

      if (length(res$content) > 0) {
          result <- purrr::map_dfr(res$content, ~ purrr::list_flatten(.x))
        } else {
          result <- NULL
        }
        result
      })
    }

  # FUSIONS/SEGMENTS query ----------------------------------------------------------------------

  # POST: /structural-variant/fetch ---
  # BODY: sampleMolecularIdentifiers: molecularProfileId, sampleId

  # Fusions endpoint works a little differently than Mutation and CNA
  # Instead of passing a sample list, you pass individual sample IDs (retrieved using list)
  # Main Goal in this function is to return all results without specifying specific genes to query (as is needed in other endpoints).

  if(data_type %in% c("fusion", "segment")) {

    # Need to get all sample IDs in study for fusion retrieval
    sample_list_url <- paste0("sample-lists/", sample_list_id)

    all_samples_in_study <- cbp_api(
      url_path = sample_list_url,
      base_url = base_url
    )$content$sampleIds %>%
      unlist() %>% unique()


    fus_imp <- purrr::map_dfr(all_samples_in_study, function(x) {

      body <- switch(data_type,
             fusion = {
               list(
                 sampleMolecularIdentifiers = as.data.frame(list(
                   molecularProfileId = jsonlite::unbox(molecular_profile_id),
                   sampleId = x)))
               },
             segment = {
               data.frame(
                 "sampleId" = x,
                 "studyId" = study_id)
               })

      fus <- cbp_api(
        url_path = paste0(url_data_type, "/fetch?"),
        method = "post",
        body = body,
        base_url = base_url
      )

      if(length(fus$content) > 0) {
        result <- purrr::map_dfr(fus$content, ~purrr::list_flatten(.x))
      } else {
        result <- NULL
      }

    })

    df <- fus_imp

  }


  # * Add Hugo Symbol & Return -----

  if(add_hugo) {

    # Fusions already has hugo by default from API, segment doesn't need
    df <- switch(data_type,
                 "fusion" = df,
                 "segment" = df,
                 "mutation" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url),
                 "cna" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url))

  }

  alert_string = ifelse(data_type != "segment",
                        "Returning all data for the {.val {molecular_profile_id}} molecular profile in the {.val {study_id}} study",
                        "Returning all {.val copy number segmentation} data for the {.val {study_id}} study")

  if(data_type == "segment" & nrow(df) == 0) {
    cli::cli_abort("No {.val copy number segmentation} data found for {.val {study_id}} study")
  }

  cli::cli_alert_info(alert_string)
  return(df)


}


# Wrapper Functions ------------------------------------------------------------

#' Get Mutations By Study ID
#'
#' @inheritParams .get_data_by_study
#'
#' @return A dataframe of mutations (maf file format)
#' @export
#' @examples
#' \dontrun{
#' get_mutations_by_study(study_id = "prad_msk_2019")
#' get_mutations_by_study(molecular_profile_id = "prad_msk_2019_mutations")
#' }
#'
get_mutations_by_study <- function(study_id = NULL,
                                  molecular_profile_id = NULL,
                                  add_hugo = TRUE,
                                  base_url = NULL) {

  .get_data_by_study(study_id = study_id,
                    molecular_profile_id = molecular_profile_id,
                    data_type = c("mutation"),
                    add_hugo = add_hugo,
                    base_url = base_url)
}

#' Get CNA By Study
#'
#' @inheritParams .get_data_by_study
#' @return A dataframe of CNAs
#' @export
#' @examples
#' \dontrun{
#' get_cna_by_study(study_id = "prad_msk_2019")
#' get_cna_by_study(molecular_profile_id = "prad_msk_2019_cna")
#' }

get_cna_by_study <- function(study_id = NULL,
                             molecular_profile_id = NULL,
                             add_hugo = TRUE,
                             base_url = NULL) {

  .get_data_by_study(study_id = study_id,
                    molecular_profile_id = molecular_profile_id,
                    data_type = c("cna"),
                    add_hugo = add_hugo,
                    base_url = base_url)
}


#' Get Fusions By Study
#'
#' @inheritParams .get_data_by_study
#' @return A dataframe of fusions
#' @export
#' @aliases get_structural_variants_by_study
#' @examples
#' \dontrun{
#' # These return the same results
#' get_fusions_by_study(molecular_profile_id = "prad_msk_2019_structural_variants")
#'
#' get_structural_variants_by_study(molecular_profile_id =
#'        "prad_msk_2019_structural_variants")
#'        }

get_fusions_by_study <- function(study_id = NULL,
                             molecular_profile_id = NULL,
                             base_url = NULL) {

  .get_data_by_study(study_id = study_id,
                    molecular_profile_id = molecular_profile_id,
                    data_type = c("fusion"),
                    # this doesn't matter. `add_hugo` doesn't do anything for fusion data
                    add_hugo = TRUE,
                    base_url = base_url)
}

#' @rdname get_fusions_by_study
#' @export
get_structural_variants_by_study <- get_fusions_by_study


#' Get Copy Number Segmentation Data By Study
#'
#' @inheritParams .get_data_by_study
#' @return A dataframe of CNA segments
#' @export
#' @examples
#' \dontrun{
#' get_segments_by_study(study_id = "prad_msk_2019")
#' get_segments_by_study(molecular_profile_id = "prad_msk_2019_cna")
#' }

get_segments_by_study <- function(study_id = NULL,
                             add_hugo = TRUE,
                             base_url = NULL) {

  .get_data_by_study(study_id = study_id,
                     molecular_profile_id = NULL,
                     data_type = c("segment"),
                     add_hugo = add_hugo,
                     base_url = base_url)
}



#' Get All Genomic Information By Study
#'
#' @inheritParams .get_data_by_study
#' @param return_segments Default is `FALSE` where copy number segmentation data won't be returned in addition to the mutation, cna and structural variant data.
#' `TRUE` will return any available segmentation data with results.
#' @return A list of mutations, cna and structural variants (including fusions), if available. Will also return copy number segmentation data if `return_segments = TRUE`.
#' @export
#' @examples
#' \dontrun{
#' get_genetics_by_study(study_id = "prad_msk_2019")
#' }
#
get_genetics_by_study <- function(study_id = NULL,
                                  add_hugo = TRUE,
                                  base_url = NULL,
                                  return_segments = FALSE) {

  # ** Not using `.check_for_study_id()` here because we allow no study ID to be passed,
  # but still don't allow study_id > 1. Maybe generalized that check function to
  # Make each check optional?
  if(length(study_id) > 1) {
    cli::cli_abort(c("{.code length(study_id)} must be 1. You can only pass one {.code study_id} at a time"))}

  data_types <- c("mutation", "cna", "structural_variant")

  if(return_segments) {
    data_types <- c(data_types, "segment")
  }

  safe_get_data <- purrr::safely(.get_data_by_study, quiet = TRUE)

 res <-  data_types %>%
   purrr::set_names() %>%
   purrr::map(., function(x) {
                       safe_get_data(study_id = study_id,
                                          molecular_profile_id = NULL,
                                          data_type = x,
                                     add_hugo = add_hugo,
                                     base_url = base_url)
                       })

 genetics <- purrr::compact(purrr::map(res, "result"))
 errors <- purrr::compact(purrr::map(res, "error"))

 switch(!purrr::is_empty(errors),
        purrr::imap(errors, ~cli_alert_warning(c("No {.val { .y}} data returned. Error:  ",
                                          # why no red :(
                                        cli::col_red('{ .x$message}'))))
        )

 genetics
}

Try the cbioportalR package in your browser

Any scripts or data that you put into this service are public.

cbioportalR documentation built on Oct. 6, 2023, 1:07 a.m.