R/assign_study_data_types.R

Defines functions get_valid_values_from_json_schema summarize_attribute assign_study_data_types

Documented in assign_study_data_types get_valid_values_from_json_schema summarize_attribute

#' Summarize data types for the study
#'
#' Data types are summarized, or "rolled-up", for the study based on its child file annotations.
#' Summary values are added back as and overwrites the current `dataType` annotation for the study.
#' See also the related `update_study_annotations`, where study-level annotations are *rolled down* to child files.
#' Note that under-the-hood this now wraps a generalized util `summarize_attribute`.
#'
#' @param study_table_id Synapse ID of reference portal study table. Used to get study ids.
#' @param fileview_id Synapse ID of the reference portal fileview.
#' @param id_col Name of the study id column in `study_table_id` and `fileview_id`. Defaults to `studyId`.
#' @param attribute Attribute being summarized using fileview. Defaults to `dataType`.
#' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects.
#' @return List of annotations objects.
#' @examples
#' \dontrun{
#' assign_study_data_types(study_table_id = 'syn52694652',
#'                         fileview_id = 'syn16858331',
#'                         id_col = 'studyId',
#'                         attribute = 'dataType',
#'                         dry_run = T)
#'}
#' @export
assign_study_data_types <- function(study_table_id,
                                    fileview_id,
                                    id_col = "studyId",
                                    attribute = "dataType",
                                    dry_run = TRUE) {

  .check_login()

  # get studies within scope from study table
  studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()

  # query the fileview
  query <- glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id}
                      where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}")

  check_fun <- function(values) {
    if(length(values) > 50) {
      warning(glue::glue("Over 50 values found, which will break the schema."))
      return(FALSE)
    } else {
      return(TRUE)
    }
  }

  summarize_attribute(summary_query = query,
                      attribute = attribute,
                      entity_id = id_col,
                      dry_run = dry_run,
                      check_fun = check_fun)
}


#' Helper summarization util
#'
#' Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity.
#' The entity could be X itself or another entity Y, e.g. a parent container entity.
#' Example 1: With datasets, summarize `species` for all the files.
#' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`).
#'
#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add `group_concat`, `distinct`, and or `unnest` to the query to get the correct list of distinct values depending on your data (e.g.`select group_concat(distinct unnest(tumorType)) as tumorType from ...`).
#' @param attribute Name of attribute to update as annotation.
#' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids.
#' @param dry_run Default = `TRUE`. Whether to update as well or just return list of annotation objects.
#' @param check_fun An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = `FALSE`.
#' It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).
#' @export
summarize_attribute <- function(summary_query,
                                attribute,
                                entity_id = NULL,
                                dry_run = TRUE,
                                check_fun = NULL) {

  values <- .syn$tableQuery(summary_query,includeRowIdAndRowVersion = F)$asDataFrame()
  meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces
  if(is_valid_syn_id(entity_id)) {
    names(meta) <- entity_id
  } else {
    names(meta) <- values[[entity_id]]
  }

  result_list <- list()
  for(entity in names(meta)) {
    entity_meta <- .syn$get_annotations(entity)
    entity_meta[attribute] <- meta[[entity]]
    result_list[[entity]] <- entity_meta
    if(!dry_run) {
      if(is.function(check_fun)) {
        if(check_fun(meta[[entity]])) .syn$set_annotations(entity_meta) else message("Skipped update for {entity}.")
      } else {
        .syn$set_annotations(entity_meta)
        message(glue::glue("Updated {entity} {attribute}."))
      }
    }
  }
  invisible(result_list)
}


#' Retrieve valid subclasses of a value in a JSON-LD schema
#' @description Retrieve valid subclasses of a value in a JSON-LD schema generated by schematic.
#' @param schema_url Default: the NF-OSI JSON-LD schema.
#' @param parent_name Default = DataType. The value for which you'd like to find the associated subclasses.
#' @param parent_context Default = bts. The JSON-LD context for the value in question.
#' @return A character vector of values.
#' @export
get_valid_values_from_json_schema <- function(schema_url = 'https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld',
                                              parent_name = 'DataType',
                                              parent_context = 'bts'){

  parent_id <- paste0(parent_context, ':', parent_name)

  subclasses <-
    jsonlite::fromJSON(schema_url) %>%
    purrr::pluck("@graph") %>%
    dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>%
    dplyr::pull(`sms:displayName`)

  subclasses
}
nf-osi/nfportalutils documentation built on Feb. 26, 2024, 1:05 p.m.